From cffcdbff70a3cf3cc4f42fa2e57012411b1c78a6 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 1 Oct 2020 15:54:40 -0700 Subject: [PATCH 01/62] selftests/bpf: Initialize duration in xdp_noinline.c Fixes clang error: tools/testing/selftests/bpf/prog_tests/xdp_noinline.c:35:6: error: variable 'duration' is uninitialized when used here [-Werror,-Wuninitialized] if (CHECK(!skel, "skel_open_and_load", "failed\n")) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201001225440.1373233-1-sdf@google.com --- tools/testing/selftests/bpf/prog_tests/xdp_noinline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c index a1f06424cf83..0281095de266 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c @@ -25,7 +25,7 @@ void test_xdp_noinline(void) __u8 flags; } real_def = {.dst = MAGIC_VAL}; __u32 ch_key = 11, real_num = 3; - __u32 duration, retval, size; + __u32 duration = 0, retval, size; int err, i; __u64 bytes = 0, pkts = 0; char buf[128]; From 48ca6243c6adf1a3d7e7a0fd519ba918eb7b00ca Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 1 Oct 2020 17:04:51 -0700 Subject: [PATCH 02/62] selftests/bpf: Properly initialize linfo in sockmap_basic When using -Werror=missing-braces, compiler complains about missing braces. Let's use use ={} initialization which should do the job: tools/testing/selftests/bpf/prog_tests/sockmap_basic.c: In function 'test_sockmap_iter': tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:181:8: error: missing braces around initializer [-Werror=missing-braces] union bpf_iter_link_info linfo = {0}; ^ tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:181:8: error: (near initialization for 'linfo.map') [-Werror=missing-braces] tools/testing/selftests/bpf/prog_tests/sockmap_basic.c: At top level: Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201002000451.1794044-1-sdf@google.com --- tools/testing/selftests/bpf/prog_tests/sockmap_basic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 4c4224e3e10a..85f73261fab0 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -198,7 +198,7 @@ static void test_sockmap_copy(enum bpf_map_type map_type) { DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); int err, len, src_fd, iter_fd, duration = 0; - union bpf_iter_link_info linfo = {0}; + union bpf_iter_link_info linfo = {}; __u32 i, num_sockets, num_elems; struct bpf_iter_sockmap *skel; __s64 *sock_fd = NULL; From 82f45c6c4a70622cc0585e3f4372e192a6491d26 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Oct 2020 18:34:48 -0700 Subject: [PATCH 03/62] bpf: tcp: Do not limit cb_flags when creating child sk from listen sk The commit 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option") unnecessarily introduced bpf_skops_init_child() which limited the child sk from inheriting all bpf_sock_ops_cb_flags of the listen sk. That breaks existing user expectation. 
This patch removes the bpf_skops_init_child() and just allows sock_copy() to do its job to copy everything from listen sk to the child sk. Fixes: 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option") Reported-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201002013448.2542025-1-kafai@fb.com --- include/net/tcp.h | 33 --------------------------------- net/ipv4/tcp_minisocks.c | 1 - 2 files changed, 34 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 3601dea931a6..d4ef5bf94168 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2228,34 +2228,6 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, #endif /* CONFIG_NET_SOCK_MSG */ #ifdef CONFIG_CGROUP_BPF -/* Copy the listen sk's HDR_OPT_CB flags to its child. - * - * During 3-Way-HandShake, the synack is usually sent from - * the listen sk with the HDR_OPT_CB flags set so that - * bpf-prog will be called to write the BPF hdr option. - * - * In fastopen, the child sk is used to send synack instead - * of the listen sk. Thus, inheriting the HDR_OPT_CB flags - * from the listen sk gives the bpf-prog a chance to write - * BPF hdr option in the synack pkt during fastopen. - * - * Both fastopen and non-fastopen child will inherit the - * HDR_OPT_CB flags to keep the bpf-prog having a consistent - * behavior when deciding to clear this cb flags (or not) - * during the PASSIVE_ESTABLISHED_CB. - * - * In the future, other cb flags could be inherited here also. - */ -static inline void bpf_skops_init_child(const struct sock *sk, - struct sock *child) -{ - tcp_sk(child)->bpf_sock_ops_cb_flags = - tcp_sk(sk)->bpf_sock_ops_cb_flags & - (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG | - BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | - BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); -} - static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) @@ -2264,11 +2236,6 @@ static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, skops->skb_data_end = skb->data + end_offset; } #else -static inline void bpf_skops_init_child(const struct sock *sk, - struct sock *child) -{ -} - static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 56c306e3cd2f..495dda2449fe 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -548,7 +548,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); - bpf_skops_init_child(sk, newsk); tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); From 96d46c508506136ed35c4b02d74ce38e91d88421 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Oct 2020 18:34:54 -0700 Subject: [PATCH 04/62] bpf: selftest: Ensure the child sk inherited all bpf_sock_ops_cb_flags This patch adds a test to ensure the child sk inherited everything from the bpf_sock_ops_cb_flags of the listen sk: 1. Sets one more cb_flags (BPF_SOCK_OPS_STATE_CB_FLAG) to the listen sk in test_tcp_hdr_options.c 2. Saves the skops->bpf_sock_ops_cb_flags when handling the newly established passive connection 3. CHECK() it is the same as the listen sk This also covers the fastopen case as the existing test_tcp_hdr_options.c does. 
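In condensed form, the check works roughly like the sockops sketch below (not the exact selftest code; the real changes follow in the diff, and the flag constants come from the uapi bpf.h): the listen sk sets its cb_flags once at LISTEN time, and the passively established child is expected to report the very same value.

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  __u32 inherited_cb_flags;

  SEC("sockops")
  int check_inherit(struct bpf_sock_ops *skops)
  {
      switch (skops->op) {
      case BPF_SOCK_OPS_TCP_LISTEN_CB:
          /* set on the listen sk; the child should inherit all of these */
          bpf_sock_ops_cb_flags_set(skops,
                                    BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG |
                                    BPF_SOCK_OPS_STATE_CB_FLAG);
          break;
      case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
          /* with the previous patch applied, this equals the
           * listen sk's bpf_sock_ops_cb_flags */
          inherited_cb_flags = skops->bpf_sock_ops_cb_flags;
          break;
      }
      return 1;
  }

  char _license[] SEC("license") = "GPL";

Userspace then simply compares inherited_cb_flags against the flags it set on the listener.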
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201002013454.2542367-1-kafai@fb.com --- .../selftests/bpf/prog_tests/tcp_hdr_options.c | 12 ++++++++++++ .../selftests/bpf/progs/test_misc_tcp_hdr_options.c | 4 ++-- .../selftests/bpf/progs/test_tcp_hdr_options.c | 7 +++++-- tools/testing/selftests/bpf/test_tcp_hdr_options.h | 5 +++-- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c index 24ba0d21b641..c86e67214a9e 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c @@ -264,9 +264,19 @@ static int check_error_linum(const struct sk_fds *sk_fds) static void check_hdr_and_close_fds(struct sk_fds *sk_fds) { + const __u32 expected_inherit_cb_flags = + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_STATE_CB_FLAG; + if (sk_fds_shutdown(sk_fds)) goto check_linum; + if (CHECK(expected_inherit_cb_flags != skel->bss->inherit_cb_flags, + "Unexpected inherit_cb_flags", "0x%x != 0x%x\n", + skel->bss->inherit_cb_flags, expected_inherit_cb_flags)) + goto check_linum; + if (check_hdr_stg(&exp_passive_hdr_stg, sk_fds->passive_fd, "passive_hdr_stg")) goto check_linum; @@ -321,6 +331,8 @@ static void reset_test(void) memset(&skel->bss->active_estab_in, 0, optsize); memset(&skel->bss->active_fin_in, 0, optsize); + skel->bss->inherit_cb_flags = 0; + skel->data->test_kind = TCPOPT_EXP; skel->data->test_magic = 0xeB9F; diff --git a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c index 3a216d1d0226..72ec0178f653 100644 --- a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c @@ -304,10 +304,10 @@ int misc_estab(struct bpf_sock_ops *skops) passive_lport_n = __bpf_htons(passive_lport_h); bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, &true_val, sizeof(true_val)); - set_hdr_cb_flags(skops); + set_hdr_cb_flags(skops, 0); break; case BPF_SOCK_OPS_TCP_CONNECT_CB: - set_hdr_cb_flags(skops); + set_hdr_cb_flags(skops, 0); break; case BPF_SOCK_OPS_PARSE_HDR_OPT_CB: return handle_parse_hdr(skops); diff --git a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c index 9197a23df3da..678bd0fad29e 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c @@ -21,6 +21,7 @@ __u8 test_kind = TCPOPT_EXP; __u16 test_magic = 0xeB9F; +__u32 inherit_cb_flags = 0; struct bpf_test_option passive_synack_out = {}; struct bpf_test_option passive_fin_out = {}; @@ -467,6 +468,8 @@ static int handle_passive_estab(struct bpf_sock_ops *skops) struct tcphdr *th; int err; + inherit_cb_flags = skops->bpf_sock_ops_cb_flags; + err = load_option(skops, &passive_estab_in, true); if (err == -ENOENT) { /* saved_syn is not found. It was in syncookie mode. 
@@ -600,10 +603,10 @@ int estab(struct bpf_sock_ops *skops) case BPF_SOCK_OPS_TCP_LISTEN_CB: bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, &true_val, sizeof(true_val)); - set_hdr_cb_flags(skops); + set_hdr_cb_flags(skops, BPF_SOCK_OPS_STATE_CB_FLAG); break; case BPF_SOCK_OPS_TCP_CONNECT_CB: - set_hdr_cb_flags(skops); + set_hdr_cb_flags(skops, 0); break; case BPF_SOCK_OPS_PARSE_HDR_OPT_CB: return handle_parse_hdr(skops); diff --git a/tools/testing/selftests/bpf/test_tcp_hdr_options.h b/tools/testing/selftests/bpf/test_tcp_hdr_options.h index 78a8cf9eab42..6118e3ab61fc 100644 --- a/tools/testing/selftests/bpf/test_tcp_hdr_options.h +++ b/tools/testing/selftests/bpf/test_tcp_hdr_options.h @@ -110,12 +110,13 @@ static inline void clear_hdr_cb_flags(struct bpf_sock_ops *skops) BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)); } -static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops) +static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops, __u32 extra) { bpf_sock_ops_cb_flags_set(skops, skops->bpf_sock_ops_cb_flags | BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | - BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG | + extra); } static inline void clear_parse_all_hdr_cb_flags(struct bpf_sock_ops *skops) From 4976b718c3551faba2c0616ef55ebeb74db1c5ca Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:44 -0700 Subject: [PATCH 05/62] bpf: Introduce pseudo_btf_id Pseudo_btf_id is a type of ld_imm insn that associates a btf_id to a ksym so that further dereferences on the ksym can use the BTF info to validate accesses. Internally, when seeing a pseudo_btf_id ld insn, the verifier reads the btf_id stored in the insn[0]'s imm field and marks the dst_reg as PTR_TO_BTF_ID. The btf_id points to a VAR_KIND, which is encoded in btf_vminux by pahole. If the VAR is not of a struct type, the dst reg will be marked as PTR_TO_MEM instead of PTR_TO_BTF_ID and the mem_size is resolved to the size of the VAR's type. >From the VAR btf_id, the verifier can also read the address of the ksym's corresponding kernel var from kallsyms and use that to fill dst_reg. Therefore, the proper functionality of pseudo_btf_id depends on (1) kallsyms and (2) the encoding of kernel global VARs in pahole, which should be available since pahole v1.18. 
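For illustration, the wire format of such an instruction pair, as a loader would emit it before the verifier rewrites it, could look like the sketch below (the btf_id value is hypothetical; BPF_PSEUDO_BTF_ID is the new constant added to the uapi header in this patch):

  #include <linux/bpf.h>

  /* hypothetical vmlinux BTF id of a KIND_VAR, resolved by a loader */
  #define EXAMPLE_VAR_BTF_ID 1234

  struct bpf_insn ksym_ldimm64[] = {
      /* insn[0]: src_reg selects the pseudo type, imm carries the btf_id */
      { .code = BPF_LD | BPF_DW | BPF_IMM, .dst_reg = BPF_REG_1,
        .src_reg = BPF_PSEUDO_BTF_ID, .imm = EXAMPLE_VAR_BTF_ID },
      /* insn[1]: imm/off must be 0; after verification the two imm fields
       * together hold the variable's kernel address from kallsyms */
      { .imm = 0 },
  };

The verifier then marks the destination register PTR_TO_BTF_ID for struct-typed VARs, or PTR_TO_MEM with the resolved size otherwise, as described above.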
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-2-haoluo@google.com --- include/linux/bpf_verifier.h | 7 ++ include/linux/btf.h | 15 ++++ include/uapi/linux/bpf.h | 34 ++++++--- kernel/bpf/btf.c | 15 ---- kernel/bpf/verifier.c | 125 +++++++++++++++++++++++++++++---- tools/include/uapi/linux/bpf.h | 34 ++++++--- 6 files changed, 186 insertions(+), 44 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 363b4f1c562a..e83ef6f6bf43 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -308,6 +308,13 @@ struct bpf_insn_aux_data { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ }; + struct { + enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ + union { + u32 btf_id; /* btf_id for struct typed var */ + u32 mem_size; /* mem_size for non-struct typed var */ + }; + } btf_var; }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 024e16ff7dcc..af1244180588 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -145,6 +145,21 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; } +static inline bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static inline bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4f556cfcbfbe..2aa156af24d6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -356,18 +356,36 @@ enum bpf_link_type { #define BPF_F_SLEEPABLE (1U << 4) /* When BPF ldimm64's insn[0].src_reg != 0 then this can have - * two extensions: + * the following extensions: * - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE - * insn[0].imm: map fd map fd - * insn[1].imm: 0 offset into value - * insn[0].off: 0 0 - * insn[1].off: 0 0 - * ldimm64 rewrite: address of map address of map[0]+offset - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + * insn[0].src_reg: BPF_PSEUDO_MAP_FD + * insn[0].imm: map fd + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP */ #define BPF_PSEUDO_MAP_FD 1 +/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_VALUE 2 +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. 
+ */ +#define BPF_PSEUDO_BTF_ID 3 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4d0ee7839fdb..00569afe3d0d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -440,16 +440,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t) return !t || btf_type_nosize(t); } -/* union is only a special case of struct: - * all its offsetof(member) == 0 - */ -static bool btf_type_is_struct(const struct btf_type *t) -{ - u8 kind = BTF_INFO_KIND(t->info); - - return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; -} - static bool __btf_type_is_struct(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; @@ -460,11 +450,6 @@ static bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static bool btf_type_is_var(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 015a1c074b6b..fe4965079773 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7488,6 +7488,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *dst_reg; struct bpf_map *map; int err; @@ -7504,25 +7505,44 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + dst_reg = ®s[insn->dst_reg]; if (insn->src_reg == 0) { u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - regs[insn->dst_reg].type = SCALAR_VALUE; + dst_reg->type = SCALAR_VALUE; __mark_reg_known(®s[insn->dst_reg], imm); return 0; } + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { + mark_reg_known_zero(env, regs, insn->dst_reg); + + dst_reg->type = aux->btf_var.reg_type; + switch (dst_reg->type) { + case PTR_TO_MEM: + dst_reg->mem_size = aux->btf_var.mem_size; + break; + case PTR_TO_BTF_ID: + dst_reg->btf_id = aux->btf_var.btf_id; + break; + default: + verbose(env, "bpf verifier is misconfigured\n"); + return -EFAULT; + } + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); - regs[insn->dst_reg].map_ptr = map; + dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].off = aux->map_off; + dst_reg->type = PTR_TO_MAP_VALUE; + dst_reg->off = aux->map_off; if (map_value_has_spin_lock(map)) - regs[insn->dst_reg].id = ++env->id_gen; + dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + dst_reg->type = CONST_PTR_TO_MAP; } else { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; @@ -9424,6 +9444,73 @@ process_bpf_exit: return 0; } +/* replace pseudo btf_id with kernel symbol address */ +static int check_pseudo_btf_id(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_insn_aux_data *aux) +{ + u32 type, id = insn->imm; + const struct btf_type *t; + const char *sym_name; + u64 addr; + + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + + if (insn[1].imm != 0) { + verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); + return 
-EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, id); + if (!t) { + verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); + return -ENOENT; + } + + if (!btf_type_is_var(t)) { + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", + id); + return -EINVAL; + } + + sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + addr = kallsyms_lookup_name(sym_name); + if (!addr) { + verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", + sym_name); + return -ENOENT; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + + type = t->type; + t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + if (!btf_type_is_struct(t)) { + const struct btf_type *ret; + const char *tname; + u32 tsize; + + /* resolve the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.mem_size = tsize; + } else { + aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf_id = type; + } + return 0; +} + static int check_map_prealloc(struct bpf_map *map) { return (map->map_type != BPF_MAP_TYPE_HASH && @@ -9534,10 +9621,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map) map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); } -/* look for pseudo eBPF instructions that access map FDs and - * replace them with actual map pointers +/* find and rewrite pseudo imm in ld_imm64 instructions: + * + * 1. if it accesses map FD, replace it with actual map pointer. + * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. + * + * NOTE: btf_vmlinux is required for converting pseudo btf_id. */ -static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) +static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -9578,6 +9669,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* valid generic load 64-bit imm */ goto next_insn; + if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) { + aux = &env->insn_aux_data[i]; + err = check_pseudo_btf_id(env, insn, aux); + if (err) + return err; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. 
*/ @@ -11633,10 +11732,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - ret = replace_map_fd_with_map_ptr(env); - if (ret < 0) - goto skip_full_check; - if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) @@ -11662,6 +11757,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret) goto skip_full_check; + ret = resolve_pseudo_ldimm64(env); + if (ret < 0) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4f556cfcbfbe..2aa156af24d6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -356,18 +356,36 @@ enum bpf_link_type { #define BPF_F_SLEEPABLE (1U << 4) /* When BPF ldimm64's insn[0].src_reg != 0 then this can have - * two extensions: + * the following extensions: * - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE - * insn[0].imm: map fd map fd - * insn[1].imm: 0 offset into value - * insn[0].off: 0 0 - * insn[1].off: 0 0 - * ldimm64 rewrite: address of map address of map[0]+offset - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + * insn[0].src_reg: BPF_PSEUDO_MAP_FD + * insn[0].imm: map fd + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP */ #define BPF_PSEUDO_MAP_FD 1 +/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_VALUE 2 +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function From d370bbe1214381f444cbd9cdf3b8647223abb536 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:45 -0700 Subject: [PATCH 06/62] bpf/libbpf: BTF support for typed ksyms If a ksym is defined with a type, libbpf will try to find the ksym's btf information from kernel btf. If a valid btf entry for the ksym is found, libbpf can pass in the found btf id to the verifier, which validates the ksym's type and value. Typeless ksyms (i.e. those defined as 'void') will not have such btf_id, but it has the symbol's address (read from kallsyms) and its value is treated as a raw pointer. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-3-haoluo@google.com --- tools/lib/bpf/libbpf.c | 112 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 13 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a4f55f8a460d..9b36c52b8511 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -390,6 +390,12 @@ struct extern_desc { } kcfg; struct { unsigned long long addr; + + /* target btf_id of the corresponding kernel var. */ + int vmlinux_btf_id; + + /* local btf_id of the ksym extern's type. 
*/ + __u32 type_id; } ksym; }; }; @@ -2522,12 +2528,23 @@ static int bpf_object__load_vmlinux_btf(struct bpf_object *obj) { bool need_vmlinux_btf = false; struct bpf_program *prog; - int err; + int i, err; /* CO-RE relocations need kernel BTF */ if (obj->btf_ext && obj->btf_ext->core_relo_info.len) need_vmlinux_btf = true; + /* Support for typed ksyms needs kernel BTF */ + for (i = 0; i < obj->nr_extern; i++) { + const struct extern_desc *ext; + + ext = &obj->externs[i]; + if (ext->type == EXT_KSYM && ext->ksym.type_id) { + need_vmlinux_btf = true; + break; + } + } + bpf_object__for_each_program(prog, obj) { if (!prog->load) continue; @@ -3156,16 +3173,10 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return -ENOTSUP; } } else if (strcmp(sec_name, KSYMS_SEC) == 0) { - const struct btf_type *vt; - ksym_sec = sec; ext->type = EXT_KSYM; - - vt = skip_mods_and_typedefs(obj->btf, t->type, NULL); - if (!btf_is_void(vt)) { - pr_warn("extern (ksym) '%s' is not typeless (void)\n", ext_name); - return -ENOTSUP; - } + skip_mods_and_typedefs(obj->btf, t->type, + &ext->ksym.type_id); } else { pr_warn("unrecognized extern section '%s'\n", sec_name); return -ENOTSUP; @@ -5800,8 +5811,13 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) insn[0].imm = obj->maps[obj->kconfig_map_idx].fd; insn[1].imm = ext->kcfg.data_off; } else /* EXT_KSYM */ { - insn[0].imm = (__u32)ext->ksym.addr; - insn[1].imm = ext->ksym.addr >> 32; + if (ext->ksym.type_id) { /* typed ksyms */ + insn[0].src_reg = BPF_PSEUDO_BTF_ID; + insn[0].imm = ext->ksym.vmlinux_btf_id; + } else { /* typeless ksyms */ + insn[0].imm = (__u32)ext->ksym.addr; + insn[1].imm = ext->ksym.addr >> 32; + } } relo->processed = true; break; @@ -6933,10 +6949,72 @@ out: return err; } +static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +{ + struct extern_desc *ext; + int i, id; + + for (i = 0; i < obj->nr_extern; i++) { + const struct btf_type *targ_var, *targ_type; + __u32 targ_type_id, local_type_id; + const char *targ_var_name; + int ret; + + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM || !ext->ksym.type_id) + continue; + + id = btf__find_by_name_kind(obj->btf_vmlinux, ext->name, + BTF_KIND_VAR); + if (id <= 0) { + pr_warn("extern (ksym) '%s': failed to find BTF ID in vmlinux BTF.\n", + ext->name); + return -ESRCH; + } + + /* find local type_id */ + local_type_id = ext->ksym.type_id; + + /* find target type_id */ + targ_var = btf__type_by_id(obj->btf_vmlinux, id); + targ_var_name = btf__name_by_offset(obj->btf_vmlinux, + targ_var->name_off); + targ_type = skip_mods_and_typedefs(obj->btf_vmlinux, + targ_var->type, + &targ_type_id); + + ret = bpf_core_types_are_compat(obj->btf, local_type_id, + obj->btf_vmlinux, targ_type_id); + if (ret <= 0) { + const struct btf_type *local_type; + const char *targ_name, *local_name; + + local_type = btf__type_by_id(obj->btf, local_type_id); + local_name = btf__name_by_offset(obj->btf, + local_type->name_off); + targ_name = btf__name_by_offset(obj->btf_vmlinux, + targ_type->name_off); + + pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", + ext->name, local_type_id, + btf_kind_str(local_type), local_name, targ_type_id, + btf_kind_str(targ_type), targ_name); + return -EINVAL; + } + + ext->is_set = true; + ext->ksym.vmlinux_btf_id = id; + pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n", + ext->name, id, btf_kind_str(targ_var), targ_var_name); + } + return 0; +} + static int 
bpf_object__resolve_externs(struct bpf_object *obj, const char *extra_kconfig) { bool need_config = false, need_kallsyms = false; + bool need_vmlinux_btf = false; struct extern_desc *ext; void *kcfg_data = NULL; int err, i; @@ -6967,7 +7045,10 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, strncmp(ext->name, "CONFIG_", 7) == 0) { need_config = true; } else if (ext->type == EXT_KSYM) { - need_kallsyms = true; + if (ext->ksym.type_id) + need_vmlinux_btf = true; + else + need_kallsyms = true; } else { pr_warn("unrecognized extern '%s'\n", ext->name); return -EINVAL; @@ -6996,6 +7077,11 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, if (err) return -EINVAL; } + if (need_vmlinux_btf) { + err = bpf_object__resolve_ksyms_btf_id(obj); + if (err) + return -EINVAL; + } for (i = 0; i < obj->nr_extern; i++) { ext = &obj->externs[i]; @@ -7028,10 +7114,10 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) } err = bpf_object__probe_loading(obj); + err = err ? : bpf_object__load_vmlinux_btf(obj); err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); - err = err ? : bpf_object__load_vmlinux_btf(obj); err = err ? : bpf_object__init_kern_struct_ops_maps(obj); err = err ? : bpf_object__create_maps(obj); err = err ? : bpf_object__relocate(obj, attr->target_btf_path); From 2c2f6abeff1322a61f5ad7ce0ecccabf53e4fad5 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:46 -0700 Subject: [PATCH 07/62] selftests/bpf: Ksyms_btf to test typed ksyms Selftests for typed ksyms. Tests two types of ksyms: one is a struct, the other is a plain int. This tests two paths in the kernel. Struct ksyms will be converted into PTR_TO_BTF_ID by the verifier while int typed ksyms will be converted into PTR_TO_MEM. 
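From the program author's point of view, the mechanism boils down to declaring the ksym with a type (a trimmed sketch; the complete test program is in the diff below, and struct rq comes from a generated vmlinux.h):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  /* struct-typed ksym: references become PTR_TO_BTF_ID in the verifier */
  extern const struct rq runqueues __ksym;
  /* int-typed ksym: references become PTR_TO_MEM of the int's size */
  extern const int bpf_prog_active __ksym;

  __u64 out__runqueues_addr = -1;
  __u64 out__bpf_prog_active_addr = -1;

  SEC("raw_tp/sys_enter")
  int handler(const void *ctx)
  {
      out__runqueues_addr = (__u64)&runqueues;
      out__bpf_prog_active_addr = (__u64)&bpf_prog_active;
      return 0;
  }

  char _license[] SEC("license") = "GPL";

libbpf resolves each typed ksym against vmlinux BTF and emits the BPF_PSEUDO_BTF_ID ldimm64 described in the earlier patches.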
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-4-haoluo@google.com --- .../testing/selftests/bpf/prog_tests/ksyms.c | 38 ++++------ .../selftests/bpf/prog_tests/ksyms_btf.c | 70 +++++++++++++++++++ .../selftests/bpf/progs/test_ksyms_btf.c | 23 ++++++ tools/testing/selftests/bpf/trace_helpers.c | 27 +++++++ tools/testing/selftests/bpf/trace_helpers.h | 4 ++ 5 files changed, 137 insertions(+), 25 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/ksyms_btf.c create mode 100644 tools/testing/selftests/bpf/progs/test_ksyms_btf.c diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c index b771804b2342..b295969b263b 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c @@ -7,40 +7,28 @@ static int duration; -static __u64 kallsyms_find(const char *sym) -{ - char type, name[500]; - __u64 addr, res = 0; - FILE *f; - - f = fopen("/proc/kallsyms", "r"); - if (CHECK(!f, "kallsyms_fopen", "failed to open: %d\n", errno)) - return 0; - - while (fscanf(f, "%llx %c %499s%*[^\n]\n", &addr, &type, name) > 0) { - if (strcmp(name, sym) == 0) { - res = addr; - goto out; - } - } - - CHECK(false, "not_found", "symbol %s not found\n", sym); -out: - fclose(f); - return res; -} - void test_ksyms(void) { - __u64 per_cpu_start_addr = kallsyms_find("__per_cpu_start"); - __u64 link_fops_addr = kallsyms_find("bpf_link_fops"); const char *btf_path = "/sys/kernel/btf/vmlinux"; struct test_ksyms *skel; struct test_ksyms__data *data; + __u64 link_fops_addr, per_cpu_start_addr; struct stat st; __u64 btf_size; int err; + err = kallsyms_find("bpf_link_fops", &link_fops_addr); + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + return; + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n")) + return; + + err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr); + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + return; + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'per_cpu_start' not found\n")) + return; + if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno)) return; btf_size = st.st_size; diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c new file mode 100644 index 000000000000..c6ef06c0629a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Google */ + +#include +#include +#include +#include "test_ksyms_btf.skel.h" + +static int duration; + +void test_ksyms_btf(void) +{ + __u64 runqueues_addr, bpf_prog_active_addr; + struct test_ksyms_btf *skel = NULL; + struct test_ksyms_btf__data *data; + struct btf *btf; + int percpu_datasec; + int err; + + err = kallsyms_find("runqueues", &runqueues_addr); + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + return; + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'runqueues' not found\n")) + return; + + err = kallsyms_find("bpf_prog_active", &bpf_prog_active_addr); + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + return; + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_prog_active' not found\n")) + return; + + btf = libbpf_find_kernel_btf(); + if (CHECK(IS_ERR(btf), "btf_exists", "failed to load kernel BTF: %ld\n", + PTR_ERR(btf))) + return; + + 
percpu_datasec = btf__find_by_name_kind(btf, ".data..percpu", + BTF_KIND_DATASEC); + if (percpu_datasec < 0) { + printf("%s:SKIP:no PERCPU DATASEC in kernel btf\n", + __func__); + test__skip(); + goto cleanup; + } + + skel = test_ksyms_btf__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n")) + goto cleanup; + + err = test_ksyms_btf__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + data = skel->data; + CHECK(data->out__runqueues_addr != runqueues_addr, "runqueues_addr", + "got %llu, exp %llu\n", + (unsigned long long)data->out__runqueues_addr, + (unsigned long long)runqueues_addr); + CHECK(data->out__bpf_prog_active_addr != bpf_prog_active_addr, "bpf_prog_active_addr", + "got %llu, exp %llu\n", + (unsigned long long)data->out__bpf_prog_active_addr, + (unsigned long long)bpf_prog_active_addr); + +cleanup: + btf__free(btf); + test_ksyms_btf__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf.c new file mode 100644 index 000000000000..7dde2082131d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Google */ + +#include "vmlinux.h" + +#include + +__u64 out__runqueues_addr = -1; +__u64 out__bpf_prog_active_addr = -1; + +extern const struct rq runqueues __ksym; /* struct type global var. */ +extern const int bpf_prog_active __ksym; /* int type global var. */ + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + out__runqueues_addr = (__u64)&runqueues; + out__bpf_prog_active_addr = (__u64)&bpf_prog_active; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 4d0e913bbb22..1bbd1d9830c8 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -90,6 +90,33 @@ long ksym_get_addr(const char *name) return 0; } +/* open kallsyms and read symbol addresses on the fly. Without caching all symbols, + * this is faster than load + find. + */ +int kallsyms_find(const char *sym, unsigned long long *addr) +{ + char type, name[500]; + unsigned long long value; + int err = 0; + FILE *f; + + f = fopen("/proc/kallsyms", "r"); + if (!f) + return -EINVAL; + + while (fscanf(f, "%llx %c %499s%*[^\n]\n", &value, &type, name) > 0) { + if (strcmp(name, sym) == 0) { + *addr = value; + goto out; + } + } + err = -ENOENT; + +out: + fclose(f); + return err; +} + void read_trace_pipe(void) { int trace_fd; diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 25ef597dd03f..f62fdef9e589 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -12,6 +12,10 @@ struct ksym { int load_kallsyms(void); struct ksym *ksym_search(long key); long ksym_get_addr(const char *name); + +/* open kallsyms and find addresses on the fly, faster than load + search. */ +int kallsyms_find(const char *sym, unsigned long long *addr); + void read_trace_pipe(void); #endif From eaa6bcb71ef6ed3dc18fc525ee7e293b06b4882b Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:47 -0700 Subject: [PATCH 08/62] bpf: Introduce bpf_per_cpu_ptr() Add bpf_per_cpu_ptr() to help bpf programs access percpu vars. 
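As a rough usage sketch (assuming a percpu kernel variable exposed as a typed __ksym, as introduced in the previous patches, and headers regenerated to declare the new helper):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern const int bpf_prog_active __ksym;   /* percpu int in the kernel */

  int out__prog_active = -1;

  SEC("raw_tp/sys_enter")
  int read_prog_active(const void *ctx)
  {
      __u32 cpu = bpf_get_smp_processor_id();
      int *active;

      /* the helper may return NULL when cpu is out of range, see below */
      active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
      if (active)
          out__prog_active = *active;
      return 0;
  }

  char _license[] SEC("license") = "GPL";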
bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the kernel except that it may return NULL. This happens when the cpu parameter is out of range. So the caller must check the returned value. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-5-haoluo@google.com --- include/linux/bpf.h | 4 +++ include/linux/btf.h | 11 ++++++ include/uapi/linux/bpf.h | 18 ++++++++++ kernel/bpf/btf.c | 10 ------ kernel/bpf/helpers.c | 18 ++++++++++ kernel/bpf/verifier.c | 64 ++++++++++++++++++++++++++++++++-- kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 18 ++++++++++ 8 files changed, 132 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50e5c4b52bd1..9dde15b2479d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -293,6 +293,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ + ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, }; @@ -307,6 +308,7 @@ enum bpf_return_type { RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -405,6 +407,7 @@ enum bpf_reg_type { PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ + PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ }; /* The information passed from prog-specific *_is_valid_access @@ -1828,6 +1831,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/btf.h b/include/linux/btf.h index af1244180588..2bf641829664 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -110,6 +110,11 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, datasec_type, member) \ + for (i = 0, member = btf_type_var_secinfo(datasec_type); \ + i < btf_type_vlen(datasec_type); \ + i++, member++) + static inline bool btf_type_is_ptr(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; @@ -194,6 +199,12 @@ static inline const struct btf_member *btf_type_member(const struct btf_type *t) return (const struct btf_member *)(t + 1); } +static inline const struct btf_var_secinfo *btf_type_var_secinfo( + const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct 
btf *btf, u32 offset); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2aa156af24d6..f3c1b637ab39 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3686,6 +3686,23 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3841,6 +3858,7 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ + FN(bpf_per_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 00569afe3d0d..ed7d02e8bc93 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -188,11 +188,6 @@ i < btf_type_vlen(struct_type); \ i++, member++) -#define for_each_vsi(i, struct_type, member) \ - for (i = 0, member = btf_type_var_secinfo(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_vsi_from(i, from, struct_type, member) \ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -598,11 +593,6 @@ static const struct btf_var *btf_type_var(const struct btf_type *t) return (const struct btf_var *)(t + 1); } -static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) -{ - return (const struct btf_var_secinfo *)(t + 1); -} - static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e825441781ab..14fe3f64fd82 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -623,6 +623,22 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) +{ + if (cpu >= nr_cpu_ids) + return (unsigned long)NULL; + + return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); +} + +const struct bpf_func_proto bpf_per_cpu_ptr_proto = { + .func = bpf_per_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, + .arg2_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -689,6 +705,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fe4965079773..216b8ece23ce 100644 --- 
a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,8 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int func_id; + u32 btf_id; + u32 ret_btf_id; }; struct btf *btf_vmlinux; @@ -517,6 +519,7 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", [PTR_TO_RDONLY_BUF] = "rdonly_buf", @@ -583,7 +586,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) + if (t == PTR_TO_BTF_ID || + t == PTR_TO_BTF_ID_OR_NULL || + t == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -2204,6 +2209,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: case PTR_TO_RDWR_BUF_OR_NULL: + case PTR_TO_PERCPU_BTF_ID: return true; default: return false; @@ -4017,6 +4023,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4042,6 +4049,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4205,6 +4213,12 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { + if (!reg->btf_id) { + verbose(env, "Helper has invalid btf_id in R%d\n", regno); + return -EACCES; + } + meta->ret_btf_id = reg->btf_id; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -5114,6 +5128,30 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL) { + const struct btf_type *t; + + mark_reg_known_zero(env, regs, BPF_REG_0); + t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + if (!btf_type_is_struct(t)) { + u32 tsize; + const struct btf_type *ret; + const char *tname; + + /* resolve the type size of ksym. 
*/ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].mem_size = tsize; + } else { + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; + } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { int ret_btf_id; @@ -7523,6 +7561,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->mem_size = aux->btf_var.mem_size; break; case PTR_TO_BTF_ID: + case PTR_TO_PERCPU_BTF_ID: dst_reg->btf_id = aux->btf_var.btf_id; break; default: @@ -9449,10 +9488,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *aux) { - u32 type, id = insn->imm; + u32 datasec_id, type, id = insn->imm; + const struct btf_var_secinfo *vsi; + const struct btf_type *datasec; const struct btf_type *t; const char *sym_name; + bool percpu = false; u64 addr; + int i; if (!btf_vmlinux) { verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); @@ -9484,12 +9527,27 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, return -ENOENT; } + datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", + BTF_KIND_DATASEC); + if (datasec_id > 0) { + datasec = btf_type_by_id(btf_vmlinux, datasec_id); + for_each_vsi(i, datasec, vsi) { + if (vsi->type == id) { + percpu = true; + break; + } + } + } + insn[0].imm = (u32)addr; insn[1].imm = addr >> 32; type = t->type; t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); - if (!btf_type_is_struct(t)) { + if (percpu) { + aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf_id = type; + } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; const char *tname; u32 tsize; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e118a83439c3..364a322e2898 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1327,6 +1327,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2aa156af24d6..f3c1b637ab39 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3686,6 +3686,23 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. 
+ * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3841,6 +3858,7 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ + FN(bpf_per_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From 63d9b80dcf2c67bc5ade61cbbaa09d7af21f43f1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:48 -0700 Subject: [PATCH 09/62] bpf: Introducte bpf_this_cpu_ptr() Add bpf_this_cpu_ptr() to help access percpu var on this cpu. This helper always returns a valid pointer, therefore no need to check returned value for NULL. Also note that all programs run with preemption disabled, which means that the returned pointer is stable during all the execution of the program. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-6-haoluo@google.com --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/bpf/verifier.c | 11 ++++++++--- kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 13 +++++++++++++ 6 files changed, 52 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9dde15b2479d..dc63eeed4fd9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -309,6 +309,7 @@ enum bpf_return_type { RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -1832,6 +1833,7 @@ extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f3c1b637ab39..c446394135be 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3703,6 +3703,18 @@ union bpf_attr { * Return * A pointer pointing to the kernel percpu variable on *cpu*, or * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3859,6 +3871,7 @@ union bpf_attr { FN(skb_cgroup_classid), \ FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ + FN(bpf_this_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 14fe3f64fd82..25520f5eeaf6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -639,6 +639,18 @@ const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) +{ + return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); +} + +const struct bpf_func_proto bpf_this_cpu_ptr_proto = { + .func = bpf_this_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -707,6 +719,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_jiffies64_proto; case BPF_FUNC_bpf_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 216b8ece23ce..d9dbf271ebab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5128,7 +5128,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL) { + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -5146,10 +5147,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_MEM : PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 364a322e2898..a136a6a63a71 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1329,6 +1329,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_snprintf_btf_proto; case BPF_FUNC_bpf_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f3c1b637ab39..c446394135be 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3703,6 +3703,18 @@ union bpf_attr { * Return * A pointer pointing to the kernel percpu variable on *cpu*, or * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). 
+ * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3859,6 +3871,7 @@ union bpf_attr { FN(skb_cgroup_classid), \ FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ + FN(bpf_this_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From 00dc73e44a846fc5310df0e1415a90af76cc135e Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:49 -0700 Subject: [PATCH 10/62] bpf/selftests: Test for bpf_per_cpu_ptr() and bpf_this_cpu_ptr() Test bpf_per_cpu_ptr() and bpf_this_cpu_ptr(). Test two paths in the kernel. If the base pointer points to a struct, the returned reg is of type PTR_TO_BTF_ID. Direct pointer dereference can be applied on the returned variable. If the base pointer isn't a struct, the returned reg is of type PTR_TO_MEM, which also supports direct pointer dereference. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-7-haoluo@google.com --- .../selftests/bpf/prog_tests/ksyms_btf.c | 18 +++++++++++ .../selftests/bpf/progs/test_ksyms_btf.c | 32 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c index c6ef06c0629a..28e26bd3e0ca 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -11,6 +11,8 @@ static int duration; void test_ksyms_btf(void) { __u64 runqueues_addr, bpf_prog_active_addr; + __u32 this_rq_cpu; + int this_bpf_prog_active; struct test_ksyms_btf *skel = NULL; struct test_ksyms_btf__data *data; struct btf *btf; @@ -64,6 +66,22 @@ void test_ksyms_btf(void) (unsigned long long)data->out__bpf_prog_active_addr, (unsigned long long)bpf_prog_active_addr); + CHECK(data->out__rq_cpu == -1, "rq_cpu", + "got %u, exp != -1\n", data->out__rq_cpu); + CHECK(data->out__bpf_prog_active < 0, "bpf_prog_active", + "got %d, exp >= 0\n", data->out__bpf_prog_active); + CHECK(data->out__cpu_0_rq_cpu != 0, "cpu_rq(0)->cpu", + "got %u, exp 0\n", data->out__cpu_0_rq_cpu); + + this_rq_cpu = data->out__this_rq_cpu; + CHECK(this_rq_cpu != data->out__rq_cpu, "this_rq_cpu", + "got %u, exp %u\n", this_rq_cpu, data->out__rq_cpu); + + this_bpf_prog_active = data->out__this_bpf_prog_active; + CHECK(this_bpf_prog_active != data->out__bpf_prog_active, "this_bpf_prog_active", + "got %d, exp %d\n", this_bpf_prog_active, + data->out__bpf_prog_active); + cleanup: btf__free(btf); test_ksyms_btf__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf.c index 7dde2082131d..bb8ea9270f29 100644 --- a/tools/testing/selftests/bpf/progs/test_ksyms_btf.c +++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf.c @@ -8,15 +8,47 @@ __u64 out__runqueues_addr = -1; __u64 out__bpf_prog_active_addr = -1; +__u32 out__rq_cpu = -1; /* percpu struct fields */ +int out__bpf_prog_active = -1; /* percpu int */ + +__u32 out__this_rq_cpu = -1; +int out__this_bpf_prog_active = -1; + +__u32 out__cpu_0_rq_cpu = -1; /* cpu_rq(0)->cpu */ + extern const struct rq runqueues __ksym; /* struct type global var. */ extern const int bpf_prog_active __ksym; /* int type global var. 
*/ SEC("raw_tp/sys_enter") int handler(const void *ctx) { + struct rq *rq; + int *active; + __u32 cpu; + out__runqueues_addr = (__u64)&runqueues; out__bpf_prog_active_addr = (__u64)&bpf_prog_active; + cpu = bpf_get_smp_processor_id(); + + /* test bpf_per_cpu_ptr() */ + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu); + if (rq) + out__rq_cpu = rq->cpu; + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) + out__bpf_prog_active = *active; + + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0); + if (rq) /* should always be valid, but we can't spare the check. */ + out__cpu_0_rq_cpu = rq->cpu; + + /* test bpf_this_cpu_ptr */ + rq = (struct rq *)bpf_this_cpu_ptr(&runqueues); + out__this_rq_cpu = rq->cpu; + active = (int *)bpf_this_cpu_ptr(&bpf_prog_active); + out__this_bpf_prog_active = *active; + return 0; } From 18ebe16d10496db795a9930c320c45653207a548 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 1 Oct 2020 18:09:52 -0700 Subject: [PATCH 11/62] bpf, sockmap: Add skb_adjust_room to pop bytes off ingress payload This implements a new helper skb_adjust_room() so users can push/pop extra bytes from a BPF_SK_SKB_STREAM_VERDICT program. Some protocols may include headers and other information that we may not want to include when doing a redirect from a BPF_SK_SKB_STREAM_VERDICT program. One use case is to redirect TLS packets into a receive socket that doesn't expect TLS data. In TLS case the first 13B or so contain the protocol header. With KTLS the payload is decrypted so we should be able to redirect this to a receiving socket, but the receiving socket may not be expecting to receive a TLS header and discard the data. Using the above helper we can pop the header off and put an appropriate header on the payload. This allows for creating a proxy between protocols without extra hops through the stack or userspace. So in order to fix this case add skb_adjust_room() so users can strip the header. After this the user can strip the header and an unmodified receiver thread will work correctly when data is redirected into the ingress path of a sock. 
Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160160099197.7052.8443193973242831692.stgit@john-Precision-5820-Tower --- net/core/filter.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 3fb6adad1957..05df73780dd3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -76,6 +76,7 @@ #include #include #include +#include static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -3479,6 +3480,48 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) SKB_MAX_ALLOC; } +BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) +{ + u32 len_diff_abs = abs(len_diff); + bool shrink = len_diff < 0; + int ret = 0; + + if (unlikely(flags || mode)) + return -EINVAL; + if (unlikely(len_diff_abs > 0xfffU)) + return -EFAULT; + + if (!shrink) { + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + __skb_push(skb, len_diff_abs); + memset(skb->data, 0, len_diff_abs); + } else { + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) + return -ENOMEM; + __skb_pull(skb, len_diff_abs); + } + bpf_compute_data_end_sk_skb(skb); + if (tls_sw_has_ctx_rx(skb->sk)) { + struct strp_msg *rxm = strp_msg(skb); + + rxm->full_len += len_diff; + } + return ret; +} + +static const struct bpf_func_proto sk_skb_adjust_room_proto = { + .func = sk_skb_adjust_room, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { @@ -6745,6 +6788,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_change_tail || func == sk_skb_change_tail || func == bpf_skb_adjust_room || + func == sk_skb_adjust_room || func == bpf_skb_pull_data || func == sk_skb_pull_data || func == bpf_clone_redirect || @@ -7218,6 +7262,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; + case BPF_FUNC_skb_adjust_room: + return &sk_skb_adjust_room_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: From 91274ca535185813cd5eebfe5a4d3344750c74e3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 1 Oct 2020 18:10:09 -0700 Subject: [PATCH 12/62] bpf, sockmap: Update selftests to use skb_adjust_room Instead of working around TLS headers in sockmap selftests use the new skb_adjust_room helper. This allows us to avoid special casing the receive side to skip headers. 
Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160160100932.7052.3646935243867660528.stgit@john-Precision-5820-Tower --- .../selftests/bpf/progs/test_sockmap_kern.h | 34 ++++++++++++++----- tools/testing/selftests/bpf/test_sockmap.c | 27 ++++----------- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index 3dca4c2e2418..1858435de7aa 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -131,39 +131,55 @@ int bpf_prog2(struct __sk_buff *skb) } -SEC("sk_skb3") -int bpf_prog3(struct __sk_buff *skb) +static inline void bpf_write_pass(struct __sk_buff *skb, int offset) { - const int one = 1; - int err, *f, ret = SK_PASS; + int err = bpf_skb_pull_data(skb, 6 + offset); void *data_end; char *c; - err = bpf_skb_pull_data(skb, 19); if (err) - goto tls_out; + return; c = (char *)(long)skb->data; data_end = (void *)(long)skb->data_end; - if (c + 18 < data_end) - memcpy(&c[13], "PASS", 4); + if (c + 5 + offset < data_end) + memcpy(c + offset, "PASS", 4); +} + +SEC("sk_skb3") +int bpf_prog3(struct __sk_buff *skb) +{ + int err, *f, ret = SK_PASS; + const int one = 1; + f = bpf_map_lookup_elem(&sock_skb_opts, &one); if (f && *f) { __u64 flags = 0; ret = 0; flags = *f; + + err = bpf_skb_adjust_room(skb, -13, 0, 0); + if (err) + return SK_DROP; + err = bpf_skb_adjust_room(skb, 4, 0, 0); + if (err) + return SK_DROP; + bpf_write_pass(skb, 0); #ifdef SOCKMAP return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags); #else return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags); #endif } - f = bpf_map_lookup_elem(&sock_skb_opts, &one); if (f && *f) ret = SK_DROP; + err = bpf_skb_adjust_room(skb, 4, 0, 0); + if (err) + return SK_DROP; + bpf_write_pass(skb, 13); tls_out: return ret; } diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 9b6fb00dc7a0..5cf45455de42 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -518,28 +518,13 @@ static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz) if (i == 0 && txmsg_ktls_skb) { if (msg->msg_iov[i].iov_len < 4) return -EIO; - if (txmsg_ktls_skb_redir) { - if (memcmp(&d[13], "PASS", 4) != 0) { - fprintf(stderr, - "detected redirect ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[13], d[14], d[15], d[16]); - return -EIO; - } - d[13] = 0; - d[14] = 1; - d[15] = 2; - d[16] = 3; - j = 13; - } else if (txmsg_ktls_skb) { - if (memcmp(d, "PASS", 4) != 0) { - fprintf(stderr, - "detected ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[0], d[1], d[2], d[3]); - return -EIO; - } - d[0] = 0; - d[1] = 1; - d[2] = 2; - d[3] = 3; + if (memcmp(d, "PASS", 4) != 0) { + fprintf(stderr, + "detected skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", + i, 0, d[0], d[1], d[2], d[3]); + return -EIO; } + j = 4; /* advance index past PASS header */ } for (; j < msg->msg_iov[i].iov_len && size; j++) { From 1028ae4069991e26d1522e957939fb61d2da1d12 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 2 Oct 2020 17:25:44 -0700 Subject: [PATCH 13/62] bpf: Deref map in BPF_PROG_BIND_MAP when it's already used We are missing a deref for the case when we are doing 
BPF_PROG_BIND_MAP on a map that's being already held by the program. There is 'if (ret) bpf_map_put(map)' below which doesn't trigger because we don't consider this an error. Let's add missing bpf_map_put() for this specific condition. Fixes: ef15314aa5de ("bpf: Add BPF_PROG_BIND_MAP syscall") Reported-by: Alexei Starovoitov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201003002544.3601440-1-sdf@google.com --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f1528c2a6927..1110ecd7d1f3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4323,8 +4323,10 @@ static int bpf_prog_bind_map(union bpf_attr *attr) used_maps_old = prog->aux->used_maps; for (i = 0; i < prog->aux->used_map_cnt; i++) - if (used_maps_old[i] == map) + if (used_maps_old[i] == map) { + bpf_map_put(map); goto out_unlock; + } used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, sizeof(used_maps_new[0]), From b75597d8947f842fa525000cf9569c21e7596da9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 5 Oct 2020 11:05:25 +0200 Subject: [PATCH 14/62] xsk: Remove internal DMA headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Christoph Hellwig correctly pointed out [1] that the AF_XDP core was pointlessly including internal headers. Let us remove those includes. [1] https://lore.kernel.org/bpf/20201005084341.GA3224@infradead.org/ Fixes: 1c1efc2af158 ("xsk: Create and free buffer pool independently from umem") Reported-by: Christoph Hellwig Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Christoph Hellwig Link: https://lore.kernel.org/bpf/20201005090525.116689-1-bjorn.topel@gmail.com --- net/xdp/xsk_buff_pool.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index e63fadd000db..64c9e55d4d4e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -3,9 +3,6 @@ #include #include #include -#include -#include -#include #include "xsk_queue.h" #include "xdp_umem.h" From 8731745e4821b6738189b7ac490cf042fa9f1fe0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 2 Oct 2020 18:42:17 -0500 Subject: [PATCH 15/62] bpf, verifier: Use fallthrough pseudo-keyword Replace /* fallthrough */ comments with the new pseudo-keyword macro fallthrough [1]. [1] https://www.kernel.org/doc/html/v5.7/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through Signed-off-by: Gustavo A. R. Silva Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201002234217.GA12280@embeddedor --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d9dbf271ebab..01120acab09a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2673,7 +2673,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; - /* fallthrough */ + fallthrough; /* Program types with direct read + write access go here! 
*/ case BPF_PROG_TYPE_SCHED_CLS: @@ -5475,7 +5475,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) break; - /* fall-through */ + fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: From 39d8f0d1026a990604770a658708f5845f7dbec0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 5 Oct 2020 09:58:38 -0700 Subject: [PATCH 16/62] bpf: Use raw_spin_trylock() for pcpu_freelist_push/pop in NMI Recent improvements in LOCKDEP highlighted a potential A-A deadlock with pcpu_freelist in NMI: ./tools/testing/selftests/bpf/test_progs -t stacktrace_build_id_nmi [ 18.984807] ================================ [ 18.984807] WARNING: inconsistent lock state [ 18.984808] 5.9.0-rc6-01771-g1466de1330e1 #2967 Not tainted [ 18.984809] -------------------------------- [ 18.984809] inconsistent {INITIAL USE} -> {IN-NMI} usage. [ 18.984810] test_progs/1990 [HC2[2]:SC0[0]:HE0:SE1] takes: [ 18.984810] ffffe8ffffc219c0 (&head->lock){....}-{2:2}, at: __pcpu_freelist_pop+0xe3/0x180 [ 18.984813] {INITIAL USE} state was registered at: [ 18.984814] lock_acquire+0x175/0x7c0 [ 18.984814] _raw_spin_lock+0x2c/0x40 [ 18.984815] __pcpu_freelist_pop+0xe3/0x180 [ 18.984815] pcpu_freelist_pop+0x31/0x40 [ 18.984816] htab_map_alloc+0xbbf/0xf40 [ 18.984816] __do_sys_bpf+0x5aa/0x3ed0 [ 18.984817] do_syscall_64+0x2d/0x40 [ 18.984818] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 18.984818] irq event stamp: 12 [...] [ 18.984822] other info that might help us debug this: [ 18.984823] Possible unsafe locking scenario: [ 18.984823] [ 18.984824] CPU0 [ 18.984824] ---- [ 18.984824] lock(&head->lock); [ 18.984826] [ 18.984826] lock(&head->lock); [ 18.984827] [ 18.984828] *** DEADLOCK *** [ 18.984828] [ 18.984829] 2 locks held by test_progs/1990: [...] [ 18.984838] [ 18.984838] dump_stack+0x9a/0xd0 [ 18.984839] lock_acquire+0x5c9/0x7c0 [ 18.984839] ? lock_release+0x6f0/0x6f0 [ 18.984840] ? __pcpu_freelist_pop+0xe3/0x180 [ 18.984840] _raw_spin_lock+0x2c/0x40 [ 18.984841] ? __pcpu_freelist_pop+0xe3/0x180 [ 18.984841] __pcpu_freelist_pop+0xe3/0x180 [ 18.984842] pcpu_freelist_pop+0x17/0x40 [ 18.984842] ? lock_release+0x6f0/0x6f0 [ 18.984843] __bpf_get_stackid+0x534/0xaf0 [ 18.984843] bpf_prog_1fd9e30e1438d3c5_oncpu+0x73/0x350 [ 18.984844] bpf_overflow_handler+0x12f/0x3f0 This is because pcpu_freelist_head.lock is accessed in both NMI and non-NMI context. Fix this issue by using raw_spin_trylock() in NMI. Since NMI interrupts non-NMI context, when NMI context tries to lock the raw_spinlock, non-NMI context of the same CPU may already have locked a lock and is blocked from unlocking the lock. For a system with N CPUs, there could be N NMIs at the same time, and they may block N non-NMI raw_spinlocks. This is tricky for pcpu_freelist_push(), where unlike _pop(), failing _push() means leaking memory. This issue is more likely to trigger in non-SMP system. Fix this issue with an extra list, pcpu_freelist.extralist. The extralist is primarily used to take _push() when raw_spin_trylock() failed on all the per CPU lists. It should be empty most of the time. The following table summarizes the behavior of pcpu_freelist in NMI and non-NMI: non-NMI pop(): use _lock(); check per CPU lists first; if all per CPU lists are empty, check extralist; if extralist is empty, return NULL. non-NMI push(): use _lock(); only push to per CPU lists. 
NMI pop(): use _trylock(); check per CPU lists first; if all per CPU lists are locked or empty, check extralist; if extralist is locked or empty, return NULL. NMI push(): use _trylock(); check per CPU lists first; if all per CPU lists are locked; try push to extralist; if extralist is also locked, keep trying on per CPU lists. Reported-by: Alexei Starovoitov Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201005165838.3735218-1-songliubraving@fb.com --- kernel/bpf/percpu_freelist.c | 101 +++++++++++++++++++++++++++++++++-- kernel/bpf/percpu_freelist.h | 1 + 2 files changed, 97 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index b367430e611c..3d897de89061 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s) raw_spin_lock_init(&head->lock); head->first = NULL; } + raw_spin_lock_init(&s->extralist.lock); + s->extralist.first = NULL; return 0; } @@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, raw_spin_unlock(&head->lock); } +static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + if (!raw_spin_trylock(&s->extralist.lock)) + return false; + + pcpu_freelist_push_node(&s->extralist, node); + raw_spin_unlock(&s->extralist.lock); + return true; +} + +static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + int cpu, orig_cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + struct pcpu_freelist_head *head; + + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + pcpu_freelist_push_node(head, node); + raw_spin_unlock(&head->lock); + return; + } + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + + /* cannot lock any per cpu lock, try extralist */ + if (cpu == orig_cpu && + pcpu_freelist_try_push_extra(s, node)) + return; + } +} + void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { - struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); - - ___pcpu_freelist_push(head, node); + if (in_nmi()) + ___pcpu_freelist_push_nmi(s, node); + else + ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); } void pcpu_freelist_push(struct pcpu_freelist *s, @@ -81,7 +121,7 @@ again: } } -struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; @@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) if (cpu >= nr_cpu_ids) cpu = 0; if (cpu == orig_cpu) - return NULL; + break; } + + /* per cpu lists are all empty, try extralist */ + raw_spin_lock(&s->extralist.lock); + node = s->extralist.first; + if (node) + s->extralist.first = node->next; + raw_spin_unlock(&s->extralist.lock); + return node; +} + +static struct pcpu_freelist_node * +___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + 
raw_spin_unlock(&head->lock); + } + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + break; + } + + /* cannot pop from per cpu lists, try extralist */ + if (!raw_spin_trylock(&s->extralist.lock)) + return NULL; + node = s->extralist.first; + if (node) + s->extralist.first = node->next; + raw_spin_unlock(&s->extralist.lock); + return node; +} + +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +{ + if (in_nmi()) + return ___pcpu_freelist_pop_nmi(s); + return ___pcpu_freelist_pop(s); } struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index fbf8a8a28979..3c76553cfe57 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -13,6 +13,7 @@ struct pcpu_freelist_head { struct pcpu_freelist { struct pcpu_freelist_head __percpu *freelist; + struct pcpu_freelist_head extralist; }; struct pcpu_freelist_node { From dca4121cdc48027320c95cbce8f5e81160aa4afe Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 5 Oct 2020 15:36:48 -0700 Subject: [PATCH 17/62] bpf, doc: Update Andrii's email in MAINTAINERS Update Andrii Nakryiko's reviewer email to kernel.org account. This optimizes email logistics on my side and makes it less likely for me to miss important patches. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201005223648.2437130-1-andrii@kernel.org --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c1e946606dce..fd5d5507d229 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3258,7 +3258,7 @@ M: Daniel Borkmann R: Martin KaFai Lau R: Song Liu R: Yonghong Song -R: Andrii Nakryiko +R: Andrii Nakryiko R: John Fastabend R: KP Singh L: netdev@vger.kernel.org From a0f2b7acb4b1d29127ff99c714233b973afd1411 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 6 Oct 2020 10:13:43 +0800 Subject: [PATCH 18/62] libbpf: Close map fd if init map slots failed Previously we forgot to close the map fd if bpf_map_update_elem() failed during map slot init, which will leak map fd. Let's move map slot initialization to new function init_map_slots() to simplify the code. And close the map fd if init slot failed. 
Reported-by: Andrii Nakryiko Signed-off-by: Hangbin Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201006021345.3817033-2-liuhangbin@gmail.com --- tools/lib/bpf/libbpf.c | 55 ++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9b36c52b8511..7cec237d9d1f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4203,6 +4203,36 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) return 0; } +static int init_map_slots(struct bpf_map *map) +{ + const struct bpf_map *targ_map; + unsigned int i; + int fd, err; + + for (i = 0; i < map->init_slots_sz; i++) { + if (!map->init_slots[i]) + continue; + + targ_map = map->init_slots[i]; + fd = bpf_map__fd(targ_map); + err = bpf_map_update_elem(map->fd, &i, &fd, 0); + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", + map->name, i, targ_map->name, + fd, err); + return err; + } + pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", + map->name, i, targ_map->name, fd); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + return 0; +} + static int bpf_object__create_maps(struct bpf_object *obj) { @@ -4245,28 +4275,11 @@ bpf_object__create_maps(struct bpf_object *obj) } if (map->init_slots_sz) { - for (j = 0; j < map->init_slots_sz; j++) { - const struct bpf_map *targ_map; - int fd; - - if (!map->init_slots[j]) - continue; - - targ_map = map->init_slots[j]; - fd = bpf_map__fd(targ_map); - err = bpf_map_update_elem(map->fd, &j, &fd, 0); - if (err) { - err = -errno; - pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", - map->name, j, targ_map->name, - fd, err); - goto err_out; - } - pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", - map->name, j, targ_map->name, fd); + err = init_map_slots(map); + if (err < 0) { + zclose(map->fd); + goto err_out; } - zfree(&map->init_slots); - map->init_slots_sz = 0; } if (map->pin_path && !map->pinned) { From 2c193d32caee83322a8511a88ba0cd097bba73f1 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 6 Oct 2020 10:13:44 +0800 Subject: [PATCH 19/62] libbpf: Check if pin_path was set even map fd exist Say a user reuse map fd after creating a map manually and set the pin_path, then load the object via libbpf. In libbpf bpf_object__create_maps(), bpf_object__reuse_map() will return 0 if there is no pinned map in map->pin_path. Then after checking if map fd exist, we should also check if pin_path was set and do bpf_map__pin() instead of continue the loop. Fix it by creating map if fd not exist and continue checking pin_path after that. 
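The user-space sequence in question, modelled loosely on the selftest added
in the next patch, looks roughly like this (object file, map name and pin
path are placeholders; error handling dropped for brevity):

  #include <bpf/bpf.h>
  #include <bpf/libbpf.h>

  int load_with_reused_fd(void)
  {
          struct bpf_object *obj;
          struct bpf_map *map;
          int map_fd;

          /* create the map manually, then hand its fd to the object */
          map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32),
                                  sizeof(__u64), 1, 0);

          obj = bpf_object__open_file("test_pinning.o", NULL);
          map = bpf_object__find_map_by_name(obj, "pinmap");

          bpf_map__reuse_fd(map, map_fd);
          bpf_map__set_pin_path(map, "/sys/fs/bpf/custom/pinmap");

          /* without this fix the preset fd made bpf_object__create_maps()
           * skip the rest of the loop body, so the pin path above was
           * never applied
           */
          return bpf_object__load(obj);
  }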
Suggested-by: Andrii Nakryiko Signed-off-by: Hangbin Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201006021345.3817033-3-liuhangbin@gmail.com --- tools/lib/bpf/libbpf.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7cec237d9d1f..9f90d1a686fd 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4256,29 +4256,28 @@ bpf_object__create_maps(struct bpf_object *obj) if (map->fd >= 0) { pr_debug("map '%s': skipping creation (preset fd=%d)\n", map->name, map->fd); - continue; - } - - err = bpf_object__create_map(obj, map); - if (err) - goto err_out; - - pr_debug("map '%s': created successfully, fd=%d\n", map->name, - map->fd); - - if (bpf_map__is_internal(map)) { - err = bpf_object__populate_internal_map(obj, map); - if (err < 0) { - zclose(map->fd); + } else { + err = bpf_object__create_map(obj, map); + if (err) goto err_out; + + pr_debug("map '%s': created successfully, fd=%d\n", + map->name, map->fd); + + if (bpf_map__is_internal(map)) { + err = bpf_object__populate_internal_map(obj, map); + if (err < 0) { + zclose(map->fd); + goto err_out; + } } - } - if (map->init_slots_sz) { - err = init_map_slots(map); - if (err < 0) { - zclose(map->fd); - goto err_out; + if (map->init_slots_sz) { + err = init_map_slots(map); + if (err < 0) { + zclose(map->fd); + goto err_out; + } } } From 44c4aa2bd1511ebd7f24d4a2eb7a26053d51d6ae Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 6 Oct 2020 10:13:45 +0800 Subject: [PATCH 20/62] selftest/bpf: Test pinning map with reused map fd This add a test to make sure that we can still pin maps with reused map fd. Signed-off-by: Hangbin Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201006021345.3817033-4-liuhangbin@gmail.com --- .../selftests/bpf/prog_tests/pinning.c | 49 ++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c index 041952524c55..fcf54b3a1dd0 100644 --- a/tools/testing/selftests/bpf/prog_tests/pinning.c +++ b/tools/testing/selftests/bpf/prog_tests/pinning.c @@ -37,7 +37,7 @@ void test_pinning(void) struct stat statbuf = {}; struct bpf_object *obj; struct bpf_map *map; - int err; + int err, map_fd; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .pin_root_path = custpath, ); @@ -213,6 +213,53 @@ void test_pinning(void) if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno)) goto out; + /* remove the custom pin path to re-test it with reuse fd below */ + err = unlink(custpinpath); + if (CHECK(err, "unlink custpinpath", "err %d errno %d\n", err, errno)) + goto out; + + err = rmdir(custpath); + if (CHECK(err, "rmdir custpindir", "err %d errno %d\n", err, errno)) + goto out; + + bpf_object__close(obj); + + /* test pinning at custom path with reuse fd */ + obj = bpf_object__open_file(file, NULL); + err = libbpf_get_error(obj); + if (CHECK(err, "default open", "err %d errno %d\n", err, errno)) { + obj = NULL; + goto out; + } + + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32), + sizeof(__u64), 1, 0); + if (CHECK(map_fd < 0, "create pinmap manually", "fd %d\n", map_fd)) + goto out; + + map = bpf_object__find_map_by_name(obj, "pinmap"); + if (CHECK(!map, "find map", "NULL map")) + goto close_map_fd; + + err = bpf_map__reuse_fd(map, map_fd); + if (CHECK(err, "reuse pinmap fd", 
"err %d errno %d\n", err, errno)) + goto close_map_fd; + + err = bpf_map__set_pin_path(map, custpinpath); + if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno)) + goto close_map_fd; + + err = bpf_object__load(obj); + if (CHECK(err, "custom load", "err %d errno %d\n", err, errno)) + goto close_map_fd; + + /* check that pinmap was pinned at the custom path */ + err = stat(custpinpath, &statbuf); + if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno)) + goto close_map_fd; + +close_map_fd: + close(map_fd); out: unlink(pinpath); unlink(nopinpath); From 8cee9107e72c93a52a7f2c58658c99887bdb05e0 Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Mon, 5 Oct 2020 15:45:28 -0700 Subject: [PATCH 21/62] bpf, libbpf: Use valid btf in bpf_program__set_attach_target bpf_program__set_attach_target(prog, fd, ...) will always fail when fd = 0 (attach to a kernel symbol) because obj->btf_vmlinux is NULL and there is no way to set it (at the moment btf_vmlinux is meant to be temporary storage for use in bpf_object__load_xattr()). Fix this by using libbpf_find_vmlinux_btf_id(). At some point we may want to opportunistically cache btf_vmlinux so it can be reused with multiple programs. Signed-off-by: Luigi Rizzo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Petar Penkov Link: https://lore.kernel.org/bpf/20201005224528.389097-1-lrizzo@google.com --- tools/lib/bpf/libbpf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9f90d1a686fd..faec389c4849 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10451,9 +10451,8 @@ int bpf_program__set_attach_target(struct bpf_program *prog, btf_id = libbpf_find_prog_btf_id(attach_func_name, attach_prog_fd); else - btf_id = __find_vmlinux_btf_id(prog->obj->btf_vmlinux, - attach_func_name, - prog->expected_attach_type); + btf_id = libbpf_find_vmlinux_btf_id(attach_func_name, + prog->expected_attach_type); if (btf_id < 0) return btf_id; From 9618bde489b2d23779b1e2d04c10983da21f3818 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 5 Oct 2020 21:34:26 -0700 Subject: [PATCH 22/62] samples/bpf: Change Makefile to cope with latest llvm With latest llvm trunk, bpf programs under samples/bpf directory, if using CORE, may experience the following errors: LLVM ERROR: Cannot select: intrinsic %llvm.preserve.struct.access.index PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace. Stack dump: 0. Program arguments: llc -march=bpf -filetype=obj -o samples/bpf/test_probe_write_user_kern.o 1. Running pass 'Function Pass Manager' on module ''. 2. Running pass 'BPF DAG->DAG Pattern Instruction Selection' on function '@bpf_prog1' #0 0x000000000183c26c llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/data/users/yhs/work/llvm-project/llvm/build.cur/install/bin/llc+0x183c26c) ... #7 0x00000000017c375e (/data/users/yhs/work/llvm-project/llvm/build.cur/install/bin/llc+0x17c375e) #8 0x00000000016a75c5 llvm::SelectionDAGISel::CannotYetSelect(llvm::SDNode*) (/data/users/yhs/work/llvm-project/llvm/build.cur/install/bin/llc+0x16a75c5) #9 0x00000000016ab4f8 llvm::SelectionDAGISel::SelectCodeCommon(llvm::SDNode*, unsigned char const*, unsigned int) (/data/users/yhs/work/llvm-project/llvm/build.cur/install/bin/llc+0x16ab4f8) ... 
Aborted (core dumped) | llc -march=bpf -filetype=obj -o samples/bpf/test_probe_write_user_kern.o The reason is due to llvm change https://reviews.llvm.org/D87153 where the CORE relocation global generation is moved from the beginning of target dependent optimization (llc) to the beginning of target independent optimization (opt). Since samples/bpf programs did not use vmlinux.h and its clang compilation uses native architecture, we need to adjust arch triple at opt level to do CORE relocation global generation properly. Otherwise, the above error will appear. This patch fixed the issue by introduce opt and llvm-dis to compilation chain, which will do proper CORE relocation global generation as well as O2 level optimization. Tested with llvm10, llvm11 and trunk/llvm12. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201006043427.1891742-1-yhs@fb.com --- samples/bpf/Makefile | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4f1ed0e3cf9f..e29b1e89dd3e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -211,6 +211,8 @@ TPROGLDLIBS_xsk_fwd += -pthread # make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang LLC ?= llc CLANG ?= clang +OPT ?= opt +LLVM_DIS ?= llvm-dis LLVM_OBJCOPY ?= llvm-objcopy BTF_PAHOLE ?= pahole @@ -303,6 +305,11 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is # useless for BPF samples. +# below we use long chain of commands, clang | opt | llvm-dis | llc, +# to generate final object file. 'clang' compiles the source into IR +# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin +# processing (llvm12) and IR optimizations. 'llvm-dis' converts +# 'opt' output to IR, and finally 'llc' generates bpf byte code. $(obj)/%.o: $(src)/%.c @echo " CLANG-bpf " $@ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ @@ -314,7 +321,9 @@ $(obj)/%.o: $(src)/%.c -Wno-address-of-packed-member -Wno-tautological-compare \ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ - -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ + -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ + $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ + $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ endif From 544d6adf3c3dddbf80e5757a3098c63612f5f8f8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 5 Oct 2020 21:34:27 -0700 Subject: [PATCH 23/62] samples/bpf: Fix a compilation error with fallthrough marking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling samples/bpf hits an error related to fallthrough marking. ... CC samples/bpf/hbm.o samples/bpf/hbm.c: In function ‘main’: samples/bpf/hbm.c:486:4: error: ‘fallthrough’ undeclared (first use in this function) fallthrough; ^~~~~~~~~~~ The "fallthrough" is not defined under tools/include directory. Rather, it is "__fallthrough" is defined in linux/compiler.h. Including "linux/compiler.h" and using "__fallthrough" fixed the issue. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201006043427.1891805-1-yhs@fb.com --- samples/bpf/hbm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index 4b22ace52f80..ff4c533dfac2 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -483,7 +484,7 @@ int main(int argc, char **argv) "Option -%c requires an argument.\n\n", optopt); case 'h': - fallthrough; + __fallthrough; default: Usage(); return 0; From 2e8806f032f5d9d5c59b790c73b83842bda78f23 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Fri, 2 Oct 2020 13:36:10 +0000 Subject: [PATCH 24/62] samples: bpf: Split xdpsock stats into new struct New statistics will be added in future commits. In preparation for this, let's split out the existing statistics into their own struct. Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201002133612.31536-1-ciara.loftus@intel.com --- samples/bpf/xdpsock_user.c | 121 +++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 53 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index b220173dbe1e..4c5022e6479e 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -91,18 +91,7 @@ static bool opt_need_wakeup = true; static u32 opt_num_xsks = 1; static u32 prog_id; -struct xsk_umem_info { - struct xsk_ring_prod fq; - struct xsk_ring_cons cq; - struct xsk_umem *umem; - void *buffer; -}; - -struct xsk_socket_info { - struct xsk_ring_cons rx; - struct xsk_ring_prod tx; - struct xsk_umem_info *umem; - struct xsk_socket *xsk; +struct xsk_ring_stats { unsigned long rx_npkts; unsigned long tx_npkts; unsigned long rx_dropped_npkts; @@ -119,6 +108,21 @@ struct xsk_socket_info { unsigned long prev_rx_full_npkts; unsigned long prev_rx_fill_empty_npkts; unsigned long prev_tx_empty_npkts; +}; + +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + void *buffer; +}; + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; + struct xsk_ring_stats ring_stats; u32 outstanding_tx; }; @@ -173,12 +177,12 @@ static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk) return err; if (optlen == sizeof(struct xdp_statistics)) { - xsk->rx_dropped_npkts = stats.rx_dropped; - xsk->rx_invalid_npkts = stats.rx_invalid_descs; - xsk->tx_invalid_npkts = stats.tx_invalid_descs; - xsk->rx_full_npkts = stats.rx_ring_full; - xsk->rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; - xsk->tx_empty_npkts = stats.tx_ring_empty_descs; + xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped; + xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs; + xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs; + xsk->ring_stats.rx_full_npkts = stats.rx_ring_full; + xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; + xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs; return 0; } @@ -198,9 +202,9 @@ static void dump_stats(void) double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, tx_invalid_pps, tx_empty_pps; - rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * + rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) * 1000000000. 
/ dt; - tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * + tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) * 1000000000. / dt; printf("\n sock%d@", i); @@ -209,47 +213,58 @@ static void dump_stats(void) printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", dt / 1000000000.); - printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); - printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); + printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts); + printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts); - xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; - xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; + xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts; + xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts; if (opt_extra_stats) { if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) { - dropped_pps = (xsks[i]->rx_dropped_npkts - - xsks[i]->prev_rx_dropped_npkts) * 1000000000. / dt; - rx_invalid_pps = (xsks[i]->rx_invalid_npkts - - xsks[i]->prev_rx_invalid_npkts) * 1000000000. / dt; - tx_invalid_pps = (xsks[i]->tx_invalid_npkts - - xsks[i]->prev_tx_invalid_npkts) * 1000000000. / dt; - full_pps = (xsks[i]->rx_full_npkts - - xsks[i]->prev_rx_full_npkts) * 1000000000. / dt; - fill_empty_pps = (xsks[i]->rx_fill_empty_npkts - - xsks[i]->prev_rx_fill_empty_npkts) - * 1000000000. / dt; - tx_empty_pps = (xsks[i]->tx_empty_npkts - - xsks[i]->prev_tx_empty_npkts) * 1000000000. / dt; + dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts - + xsks[i]->ring_stats.prev_rx_dropped_npkts) * + 1000000000. / dt; + rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts - + xsks[i]->ring_stats.prev_rx_invalid_npkts) * + 1000000000. / dt; + tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts - + xsks[i]->ring_stats.prev_tx_invalid_npkts) * + 1000000000. / dt; + full_pps = (xsks[i]->ring_stats.rx_full_npkts - + xsks[i]->ring_stats.prev_rx_full_npkts) * + 1000000000. / dt; + fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts - + xsks[i]->ring_stats.prev_rx_fill_empty_npkts) * + 1000000000. / dt; + tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts - + xsks[i]->ring_stats.prev_tx_empty_npkts) * + 1000000000. 
/ dt; printf(fmt, "rx dropped", dropped_pps, - xsks[i]->rx_dropped_npkts); + xsks[i]->ring_stats.rx_dropped_npkts); printf(fmt, "rx invalid", rx_invalid_pps, - xsks[i]->rx_invalid_npkts); + xsks[i]->ring_stats.rx_invalid_npkts); printf(fmt, "tx invalid", tx_invalid_pps, - xsks[i]->tx_invalid_npkts); + xsks[i]->ring_stats.tx_invalid_npkts); printf(fmt, "rx queue full", full_pps, - xsks[i]->rx_full_npkts); + xsks[i]->ring_stats.rx_full_npkts); printf(fmt, "fill ring empty", fill_empty_pps, - xsks[i]->rx_fill_empty_npkts); + xsks[i]->ring_stats.rx_fill_empty_npkts); printf(fmt, "tx ring empty", tx_empty_pps, - xsks[i]->tx_empty_npkts); + xsks[i]->ring_stats.tx_empty_npkts); - xsks[i]->prev_rx_dropped_npkts = xsks[i]->rx_dropped_npkts; - xsks[i]->prev_rx_invalid_npkts = xsks[i]->rx_invalid_npkts; - xsks[i]->prev_tx_invalid_npkts = xsks[i]->tx_invalid_npkts; - xsks[i]->prev_rx_full_npkts = xsks[i]->rx_full_npkts; - xsks[i]->prev_rx_fill_empty_npkts = xsks[i]->rx_fill_empty_npkts; - xsks[i]->prev_tx_empty_npkts = xsks[i]->tx_empty_npkts; + xsks[i]->ring_stats.prev_rx_dropped_npkts = + xsks[i]->ring_stats.rx_dropped_npkts; + xsks[i]->ring_stats.prev_rx_invalid_npkts = + xsks[i]->ring_stats.rx_invalid_npkts; + xsks[i]->ring_stats.prev_tx_invalid_npkts = + xsks[i]->ring_stats.tx_invalid_npkts; + xsks[i]->ring_stats.prev_rx_full_npkts = + xsks[i]->ring_stats.rx_full_npkts; + xsks[i]->ring_stats.prev_rx_fill_empty_npkts = + xsks[i]->ring_stats.rx_fill_empty_npkts; + xsks[i]->ring_stats.prev_tx_empty_npkts = + xsks[i]->ring_stats.tx_empty_npkts; } else { printf("%-15s\n", "Error retrieving extra stats"); } @@ -936,7 +951,7 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->tx_npkts += rcvd; + xsk->ring_stats.tx_npkts += rcvd; } } @@ -956,7 +971,7 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, if (rcvd > 0) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->tx_npkts += rcvd; + xsk->ring_stats.tx_npkts += rcvd; } } @@ -996,7 +1011,7 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->rx_npkts += rcvd; + xsk->ring_stats.rx_npkts += rcvd; } static void rx_drop_all(void) @@ -1155,7 +1170,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) xsk_ring_prod__submit(&xsk->tx, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->rx_npkts += rcvd; + xsk->ring_stats.rx_npkts += rcvd; xsk->outstanding_tx += rcvd; } From 60dc609dbd545226e00c9e7b518d0f4873f0a3e1 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Fri, 2 Oct 2020 13:36:11 +0000 Subject: [PATCH 25/62] samples: bpf: Count syscalls in xdpsock Categorise and record syscalls issued in the xdpsock sample app. The categories recorded are: rx_empty_polls: polls when the rx ring is empty fill_fail_polls: polls when failed to get addr from fill ring copy_tx_sendtos: sendtos issued for tx when copy mode enabled tx_wakeup_sendtos: sendtos issued when tx ring needs waking up opt_polls: polls issued since the '-p' flag is set Print the stats using '-a' on the xdpsock command line. 
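For example, a rxdrop run with polling and application stats enabled could
be launched as below (interface and queue number are placeholders); the
stats are printed as calls/s and total counts alongside the regular pps
output:

  ./xdpsock -i eth0 -q 0 -r -p -a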
Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201002133612.31536-2-ciara.loftus@intel.com --- samples/bpf/xdpsock_user.c | 113 +++++++++++++++++++++++++++++++++---- 1 file changed, 103 insertions(+), 10 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 4c5022e6479e..ff119ede4ab1 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -79,6 +79,7 @@ static u16 opt_pkt_size = MIN_PKT_SIZE; static u32 opt_pkt_fill_pattern = 0x12345678; static bool opt_extra_stats; static bool opt_quiet; +static bool opt_app_stats; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -110,6 +111,19 @@ struct xsk_ring_stats { unsigned long prev_tx_empty_npkts; }; +struct xsk_app_stats { + unsigned long rx_empty_polls; + unsigned long fill_fail_polls; + unsigned long copy_tx_sendtos; + unsigned long tx_wakeup_sendtos; + unsigned long opt_polls; + unsigned long prev_rx_empty_polls; + unsigned long prev_fill_fail_polls; + unsigned long prev_copy_tx_sendtos; + unsigned long prev_tx_wakeup_sendtos; + unsigned long prev_opt_polls; +}; + struct xsk_umem_info { struct xsk_ring_prod fq; struct xsk_ring_cons cq; @@ -123,6 +137,7 @@ struct xsk_socket_info { struct xsk_umem_info *umem; struct xsk_socket *xsk; struct xsk_ring_stats ring_stats; + struct xsk_app_stats app_stats; u32 outstanding_tx; }; @@ -189,6 +204,45 @@ static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk) return -EINVAL; } +static void dump_app_stats(long dt) +{ + int i; + + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-18s %'-14.0f %'-14lu\n"; + double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps, + tx_wakeup_sendtos_ps, opt_polls_ps; + + rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls - + xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt; + fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls - + xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt; + copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos - + xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt; + tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos - + xsks[i]->app_stats.prev_tx_wakeup_sendtos) + * 1000000000. / dt; + opt_polls_ps = (xsks[i]->app_stats.opt_polls - + xsks[i]->app_stats.prev_opt_polls) * 1000000000. 
/ dt; + + printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count"); + printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls); + printf(fmt, "fill fail polls", fill_fail_polls_ps, + xsks[i]->app_stats.fill_fail_polls); + printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps, + xsks[i]->app_stats.copy_tx_sendtos); + printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps, + xsks[i]->app_stats.tx_wakeup_sendtos); + printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls); + + xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls; + xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls; + xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos; + xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos; + xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls; + } +} + static void dump_stats(void) { unsigned long now = get_nsecs(); @@ -198,7 +252,7 @@ static void dump_stats(void) prev_time = now; for (i = 0; i < num_socks && xsks[i]; i++) { - char *fmt = "%-15s %'-11.0f %'-11lu\n"; + char *fmt = "%-18s %'-14.0f %'-14lu\n"; double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, tx_invalid_pps, tx_empty_pps; @@ -211,7 +265,7 @@ static void dump_stats(void) print_benchmark(false); printf("\n"); - printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", + printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts", dt / 1000000000.); printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts); printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts); @@ -270,6 +324,9 @@ static void dump_stats(void) } } } + + if (opt_app_stats) + dump_app_stats(dt); } static bool is_benchmark_done(void) @@ -708,6 +765,17 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, if (ret) exit_with_error(-ret); + xsk->app_stats.rx_empty_polls = 0; + xsk->app_stats.fill_fail_polls = 0; + xsk->app_stats.copy_tx_sendtos = 0; + xsk->app_stats.tx_wakeup_sendtos = 0; + xsk->app_stats.opt_polls = 0; + xsk->app_stats.prev_rx_empty_polls = 0; + xsk->app_stats.prev_fill_fail_polls = 0; + xsk->app_stats.prev_copy_tx_sendtos = 0; + xsk->app_stats.prev_tx_wakeup_sendtos = 0; + xsk->app_stats.prev_opt_polls = 0; + return xsk; } @@ -735,6 +803,7 @@ static struct option long_options[] = { {"tx-pkt-pattern", required_argument, 0, 'P'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, + {"app-stats", no_argument, 0, 'a'}, {0, 0, 0, 0} }; @@ -771,6 +840,7 @@ static void usage(const char *prog) " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" + " -a, --app-stats Display application (syscall) statistics.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, @@ -786,7 +856,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQ", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQa", long_options, &option_index); if (c == -1) break; @@ -873,6 +943,9 @@ static void parse_command_line(int argc, char **argv) case 'Q': opt_quiet = 1; break; + case 'a': + opt_app_stats = 1; + break; default: usage(basename(argv[0])); } @@ -923,8 +996,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, * is driven by the NAPI loop. 
So as an optimization, we do not have to call * sendto() all the time in zero-copy mode for l2fwd. */ - if (opt_xdp_bind_flags & XDP_COPY) + if (opt_xdp_bind_flags & XDP_COPY) { + xsk->app_stats.copy_tx_sendtos++; kick_tx(xsk); + } ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : xsk->outstanding_tx; @@ -939,8 +1014,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&umem->fq)) + if (xsk_ring_prod__needs_wakeup(&umem->fq)) { + xsk->app_stats.fill_fail_polls++; ret = poll(fds, num_socks, opt_timeout); + } ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); } @@ -964,8 +1041,10 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, if (!xsk->outstanding_tx) return; - if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) + if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) { + xsk->app_stats.tx_wakeup_sendtos++; kick_tx(xsk); + } rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); if (rcvd > 0) { @@ -983,8 +1062,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.rx_empty_polls++; ret = poll(fds, num_socks, opt_timeout); + } return; } @@ -992,8 +1073,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.fill_fail_polls++; ret = poll(fds, num_socks, opt_timeout); + } ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } @@ -1026,6 +1109,8 @@ static void rx_drop_all(void) for (;;) { if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; @@ -1106,6 +1191,8 @@ static void tx_only_all(void) int batch_size = get_batch_size(pkt_cnt); if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; @@ -1137,8 +1224,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.rx_empty_polls++; ret = poll(fds, num_socks, opt_timeout); + } return; } @@ -1147,8 +1236,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) if (ret < 0) exit_with_error(-ret); complete_tx_l2fwd(xsk, fds); - if (xsk_ring_prod__needs_wakeup(&xsk->tx)) + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { + xsk->app_stats.tx_wakeup_sendtos++; kick_tx(xsk); + } ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); } @@ -1186,6 +1277,8 @@ static void l2fwd_all(void) for (;;) { if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; From 67ed375530e223624508382a9348a21c3b2ccc09 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Fri, 2 Oct 2020 13:36:12 +0000 Subject: [PATCH 26/62] samples: bpf: Driver interrupt statistics in xdpsock Add an option to count the number of interrupts generated per second and total number of interrupts during the lifetime of 
the application for a given interface. This information is extracted from /proc/interrupts. Since there is no naming convention across drivers, the user must provide the string which is specific to their interface in the /proc/interrupts file on the command line. Usage: ./xdpsock ... -I eg. for queue 0 of i40e device eth0: ./xdpsock ... -I i40e-eth0-TxRx-0 Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201002133612.31536-3-ciara.loftus@intel.com --- samples/bpf/xdpsock_user.c | 120 ++++++++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index ff119ede4ab1..1149e94ca32f 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,9 @@ static u32 opt_pkt_fill_pattern = 0x12345678; static bool opt_extra_stats; static bool opt_quiet; static bool opt_app_stats; +static const char *opt_irq_str = ""; +static u32 irq_no; +static int irqs_at_init = -1; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -111,6 +115,11 @@ struct xsk_ring_stats { unsigned long prev_tx_empty_npkts; }; +struct xsk_driver_stats { + unsigned long intrs; + unsigned long prev_intrs; +}; + struct xsk_app_stats { unsigned long rx_empty_polls; unsigned long fill_fail_polls; @@ -138,6 +147,7 @@ struct xsk_socket_info { struct xsk_socket *xsk; struct xsk_ring_stats ring_stats; struct xsk_app_stats app_stats; + struct xsk_driver_stats drv_stats; u32 outstanding_tx; }; @@ -243,6 +253,100 @@ static void dump_app_stats(long dt) } } +static bool get_interrupt_number(void) +{ + FILE *f_int_proc; + char line[4096]; + bool found = false; + + f_int_proc = fopen("/proc/interrupts", "r"); + if (f_int_proc == NULL) { + printf("Failed to open /proc/interrupts.\n"); + return found; + } + + while (!feof(f_int_proc) && !found) { + /* Make sure to read a full line at a time */ + if (fgets(line, sizeof(line), f_int_proc) == NULL || + line[strlen(line) - 1] != '\n') { + printf("Error reading from interrupts file\n"); + break; + } + + /* Extract interrupt number from line */ + if (strstr(line, opt_irq_str) != NULL) { + irq_no = atoi(line); + found = true; + break; + } + } + + fclose(f_int_proc); + + return found; +} + +static int get_irqs(void) +{ + char count_path[PATH_MAX]; + int total_intrs = -1; + FILE *f_count_proc; + char line[4096]; + + snprintf(count_path, sizeof(count_path), + "/sys/kernel/irq/%i/per_cpu_count", irq_no); + f_count_proc = fopen(count_path, "r"); + if (f_count_proc == NULL) { + printf("Failed to open %s\n", count_path); + return total_intrs; + } + + if (fgets(line, sizeof(line), f_count_proc) == NULL || + line[strlen(line) - 1] != '\n') { + printf("Error reading from %s\n", count_path); + } else { + static const char com[2] = ","; + char *token; + + total_intrs = 0; + token = strtok(line, com); + while (token != NULL) { + /* sum up interrupts across all cores */ + total_intrs += atoi(token); + token = strtok(NULL, com); + } + } + + fclose(f_count_proc); + + return total_intrs; +} + +static void dump_driver_stats(long dt) +{ + int i; + + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-18s %'-14.0f %'-14lu\n"; + double intrs_ps; + int n_ints = get_irqs(); + + if (n_ints < 0) { + printf("error getting intr info for intr %i\n", irq_no); + return; + } + xsks[i]->drv_stats.intrs = n_ints - irqs_at_init; + + 
intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) * + 1000000000. / dt; + + printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count"); + printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs); + + xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs; + } +} + static void dump_stats(void) { unsigned long now = get_nsecs(); @@ -327,6 +431,8 @@ static void dump_stats(void) if (opt_app_stats) dump_app_stats(dt); + if (irq_no) + dump_driver_stats(dt); } static bool is_benchmark_done(void) @@ -804,6 +910,7 @@ static struct option long_options[] = { {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, + {"irq-string", no_argument, 0, 'I'}, {0, 0, 0, 0} }; @@ -841,6 +948,7 @@ static void usage(const char *prog) " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" + " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, @@ -856,7 +964,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQa", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:", long_options, &option_index); if (c == -1) break; @@ -945,6 +1053,16 @@ static void parse_command_line(int argc, char **argv) break; case 'a': opt_app_stats = 1; + break; + case 'I': + opt_irq_str = optarg; + if (get_interrupt_number()) + irqs_at_init = get_irqs(); + if (irqs_at_init < 0) { + fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str); + usage(basename(argv[0])); + } + break; default: usage(basename(argv[0])); From bf88a80a0407e67f21ceafbf03828086f149dfad Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 6 Oct 2020 19:28:57 -0700 Subject: [PATCH 27/62] selftests/bpf: Fix test_verifier after introducing resolve_pseudo_ldimm64 Commit 4976b718c355 ("bpf: Introduce pseudo_btf_id") switched the order of check_subprogs() and resolve_pseudo_ldimm() in the verifier. Now an empty prog expects to see the error "last insn is not an exit or jmp" instead, because the check for subprogs comes first. It's now pointless to validate that half of ldimm64 won't be the last instruction in the prog of a single invalid ldimm. Tested: # ./test_verifier Summary: 1129 PASSED, 537 SKIPPED, 0 FAILED and the full set of bpf selftests.
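For context, "half of ldimm64" means emitting only the first of the two instruction slots that BPF_LD | BPF_IMM | BPF_DW occupies. A minimal sketch of such a prog, in the style of the test being removed (illustrative only, not part of this patch; BPF_RAW_INSN comes from the selftests' insn macros):

  struct bpf_insn half_ldimm64_prog[] = {
          /* first slot of the two-slot ld_imm64 encoding */
          BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
          /* the second slot carrying the upper 32 bits of the immediate is
           * missing, so the last insn is neither an exit nor a jmp and the
           * subprog check now rejects the prog before the ld_imm64-specific
           * validation is ever reached
           */
  };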
Fixes: 4976b718c355 ("bpf: Introduce pseudo_btf_id") Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201007022857.2791884-1-haoluo@google.com --- tools/testing/selftests/bpf/verifier/basic.c | 2 +- tools/testing/selftests/bpf/verifier/ld_imm64.c | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/verifier/basic.c b/tools/testing/selftests/bpf/verifier/basic.c index b8d18642653a..de84f0d57082 100644 --- a/tools/testing/selftests/bpf/verifier/basic.c +++ b/tools/testing/selftests/bpf/verifier/basic.c @@ -2,7 +2,7 @@ "empty prog", .insns = { }, - .errstr = "unknown opcode 00", + .errstr = "last insn is not an exit or jmp", .result = REJECT, }, { diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c index 3856dba733e9..f9297900cea6 100644 --- a/tools/testing/selftests/bpf/verifier/ld_imm64.c +++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c @@ -50,14 +50,6 @@ .errstr = "invalid bpf_ld_imm64 insn", .result = REJECT, }, -{ - "test5 ld_imm64", - .insns = { - BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), - }, - .errstr = "invalid bpf_ld_imm64 insn", - .result = REJECT, -}, { "test6 ld_imm64", .insns = { From 49a2a4d4163f95e36e2bec50e06813b73401bc76 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 6 Oct 2020 19:16:13 -0700 Subject: [PATCH 28/62] kernel/bpf/verifier: Fix build when NET is not enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build errors in kernel/bpf/verifier.c when CONFIG_NET is not enabled. ../kernel/bpf/verifier.c:3995:13: error: ‘btf_sock_ids’ undeclared here (not in a function); did you mean ‘bpf_sock_ops’? .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], ../kernel/bpf/verifier.c:3995:26: error: ‘BTF_SOCK_TYPE_SOCK_COMMON’ undeclared here (not in a function); did you mean ‘PTR_TO_SOCK_COMMON’? 
.btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], Fixes: 1df8f55a37bd ("bpf: Enable bpf_skc_to_* sock casting helper to networking prog type") Signed-off-by: Randy Dunlap Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201007021613.13646-1-rdunlap@infradead.org --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 01120acab09a..62b804651a48 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3984,6 +3984,7 @@ static const struct bpf_reg_types sock_types = { }, }; +#ifdef CONFIG_NET static const struct bpf_reg_types btf_id_sock_common_types = { .types = { PTR_TO_SOCK_COMMON, @@ -3994,6 +3995,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = { }, .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], }; +#endif static const struct bpf_reg_types mem_types = { .types = { @@ -4037,7 +4039,9 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CTX] = &context_types, [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, +#ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, +#endif [ARG_PTR_TO_SOCKET] = &fullsock_types, [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, From ebfb4d40ed9dd45d2d6780f58d0cd9336d647890 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 6 Oct 2020 23:29:33 -0700 Subject: [PATCH 29/62] bpf: Fix build failure for kernel/trace/bpf_trace.c with CONFIG_NET=n When CONFIG_NET is not defined, I hit the following build error: kernel/trace/bpf_trace.o:(.rodata+0x110): undefined reference to `bpf_prog_test_run_raw_tp' Commit 1b4d60ec162f ("bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint") added test_run support for raw_tracepoint in /kernel/trace/bpf_trace.c. But the test_run function bpf_prog_test_run_raw_tp is defined in net/bpf/test_run.c, only available with CONFIG_NET=y. Adding a CONFIG_NET guard for .test_run = bpf_prog_test_run_raw_tp; fixed the above build issue. 
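The guard pattern in isolation, as a hedged sketch (the struct and the prototype below are simplified stand-ins, not the kernel definitions): when the callback lives in code that is only built with CONFIG_NET=y, the ops table must not reference it otherwise, or the link fails with an undefined reference.

  struct prog_ops {
          int (*test_run)(void);
  };

  #ifdef CONFIG_NET
  int bpf_prog_test_run_raw_tp(void);           /* provided by net/bpf/test_run.c */
  #endif

  const struct prog_ops raw_tracepoint_ops = {
  #ifdef CONFIG_NET
          .test_run = bpf_prog_test_run_raw_tp,  /* only wired up when it is built */
  #endif
  };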
Fixes: 1b4d60ec162f ("bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201007062933.3425899-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a136a6a63a71..a2a4535b6277 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1780,7 +1780,9 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { +#ifdef CONFIG_NET .test_run = bpf_prog_test_run_raw_tp, +#endif }; const struct bpf_verifier_ops tracing_verifier_ops = { From 49f3d12b0f70ea867b891ad2a97f6e51bb564e18 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Wed, 7 Oct 2020 07:57:17 +0200 Subject: [PATCH 30/62] bpf: Fix typo in uapi/linux/bpf.h Reported-by: Samanta Navarro Signed-off-by: Jakub Wilk Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201007055717.7319-1-jwilk@jwilk.net --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c446394135be..d83561e8cd2c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2253,7 +2253,7 @@ union bpf_attr { * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. - * if the verdeict eBPF program returns **SK_PASS**), redirect it + * if the verdict eBPF program returns **SK_PASS**), redirect it * to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and * egress interfaces can be used for redirection. The diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c446394135be..d83561e8cd2c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2253,7 +2253,7 @@ union bpf_attr { * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. - * if the verdeict eBPF program returns **SK_PASS**), redirect it + * if the verdict eBPF program returns **SK_PASS**), redirect it * to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and * egress interfaces can be used for redirection. The From 80348d8867c65e2c64ca78d82326e889550a24b6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 7 Oct 2020 13:42:26 +0200 Subject: [PATCH 31/62] libbpf: Fix compatibility problem in xsk_socket__create Fix a compatibility problem when the old XDP_SHARED_UMEM mode is used together with the xsk_socket__create() call. In the old XDP_SHARED_UMEM mode, only sharing of the same device and queue id was allowed, and in this mode, the fill ring and completion ring were shared between the AF_XDP sockets. Therefore, it was perfectly fine to call the xsk_socket__create() API for each socket and not use the new xsk_socket__create_shared() API. This behavior was ruined by the commit introducing XDP_SHARED_UMEM support between different devices and/or queue ids. This patch restores the ability to use xsk_socket__create in these circumstances so that backward compatibility is not broken. 
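For illustration, a minimal sketch of the legacy usage this patch keeps working (names, sizes and the omitted error handling are placeholders, not from the patch): both sockets are bound to the same device and queue id, so they share the single fill and completion ring registered with the umem, and neither call needs the _shared() variant.

  struct xsk_ring_prod fill, tx0, tx1;
  struct xsk_ring_cons comp, rx0, rx1;
  struct xsk_umem *umem;
  struct xsk_socket *xsk0, *xsk1;

  /* one umem with a single fill/completion ring pair ... */
  xsk_umem__create(&umem, bufs, NUM_FRAMES * FRAME_SIZE, &fill, &comp, NULL);

  /* ... shared by two sockets on the same netdev/queue pair; no per-socket
   * fill/comp rings are passed, which xsk_socket__create() accepts again
   * after this fix
   */
  xsk_socket__create(&xsk0, "eth0", 0, umem, &rx0, &tx0, NULL);
  xsk_socket__create(&xsk1, "eth0", 0, umem, &rx1, &tx1, NULL);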
Fixes: 2f6324a3937f ("libbpf: Support shared umems between queues and devices") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1602070946-11154-1-git-send-email-magnus.karlsson@gmail.com --- tools/lib/bpf/xsk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 30b4ca5d2ac7..e3c98c007825 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -705,7 +705,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, struct xsk_ctx *ctx; int err, ifindex; - if (!umem || !xsk_ptr || !(rx || tx) || !fill || !comp) + if (!umem || !xsk_ptr || !(rx || tx)) return -EFAULT; xsk = calloc(1, sizeof(*xsk)); @@ -735,6 +735,11 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, ctx = xsk_get_ctx(umem, ifindex, queue_id); if (!ctx) { + if (!fill || !comp) { + err = -EFAULT; + goto out_socket; + } + ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id, fill, comp); if (!ctx) { From 47f7cf6325f7d38e33d3dc38fa55bbe605f5b335 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 7 Oct 2020 17:10:21 -0700 Subject: [PATCH 32/62] libbpf: Skip CO-RE relocations for not loaded BPF programs Bypass CO-RE relocations step for BPF programs that are not going to be loaded. This allows to have BPF programs compiled in and disabled dynamically if kernel is not supposed to provide enough relocation information. In such case, there won't be unnecessary warnings about failed relocations. Fixes: d929758101fc ("libbpf: Support disabling auto-loading BPF programs") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201008001025.292064-2-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index faec389c4849..07d62771472f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5765,6 +5765,11 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) err = -EINVAL; goto out; } + /* no need to apply CO-RE relocation if the program is + * not going to be loaded + */ + if (!prog->load) + continue; err = bpf_core_apply_relo(prog, rec, i, obj->btf, targ_btf, cand_cache); From a66345bcbdf0c5070479c26103f7cb33da9e4969 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 7 Oct 2020 17:10:22 -0700 Subject: [PATCH 33/62] libbpf: Support safe subset of load/store instruction resizing with CO-RE Add support for patching instructions of the following form: - rX = *(T *)(rY + ); - *(T *)(rX + ) = rY; - *(T *)(rX + ) = , where T is one of {u8, u16, u32, u64}. For such instructions, if the actual kernel field recorded in CO-RE relocation has a different size than the one recorded locally (e.g., from vmlinux.h), then libbpf will adjust T to an appropriate 1-, 2-, 4-, or 8-byte loads. In general, such transformation is not always correct and could lead to invalid final value being loaded or stored. But two classes of cases are always safe: - if both local and target (kernel) types are unsigned integers, but of different sizes, then it's OK to adjust load/store instruction according to the necessary memory size. 
Zero-extending nature of such instructions and unsignedness make sure that the final value is always correct; - pointer size mismatch between BPF target architecture (which is always 64-bit) and 32-bit host kernel architecture can be similarly resolved automatically, because pointer is essentially an unsigned integer. Loading 32-bit pointer into 64-bit BPF register with zero extension will leave correct pointer in the register. Both cases are necessary to support CO-RE on 32-bit kernels, as `unsigned long` in vmlinux.h generated from 32-bit kernel is 32-bit, but when compiled with BPF program for BPF target it will be treated by compiler as 64-bit integer. Similarly, pointers in vmlinux.h are 32-bit for kernel, but treated as 64-bit values by compiler for BPF target. Both problems are now resolved by libbpf for direct memory reads. But similar transformations are useful in general when kernel fields are "resized" from, e.g., unsigned int to unsigned long (or vice versa). Now, similar transformations for signed integers are not safe to perform as they will result in incorrect sign extension of the value. If such situation is detected, libbpf will emit helpful message and will poison the instruction. Not failing immediately means that it's possible to guard the instruction based on kernel version (or other conditions) and make sure it's not reachable. If there is a need to read signed integers that change sizes between different kernels, it's possible to use BPF_CORE_READ_BITFIELD() macro, which works both with bitfields and non-bitfield integers of any signedness and handles sign-extension properly. Also, bpf_core_read() with proper size and/or use of bpf_core_field_size() relocation could allow to deal with such complicated situations explicitly, if not so conventiently as direct memory reads. Selftests added in a separate patch in progs/test_core_autosize.c demonstrate both direct memory and probed use cases. BPF_CORE_READ() is not changed and it won't deal with such situations as automatically as direct memory reads due to the signedness integer limitations, which are much harder to detect and control with compiler macro magic. So it's encouraged to utilize direct memory reads as much as possible. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201008001025.292064-3-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 144 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 136 insertions(+), 8 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 07d62771472f..032cf0049ddb 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5040,16 +5040,19 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, static int bpf_core_calc_field_relo(const struct bpf_program *prog, const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, - __u32 *val, bool *validate) + __u32 *val, __u32 *field_sz, __u32 *type_id, + bool *validate) { const struct bpf_core_accessor *acc; const struct btf_type *t; - __u32 byte_off, byte_sz, bit_off, bit_sz; + __u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id; const struct btf_member *m; const struct btf_type *mt; bool bitfield; __s64 sz; + *field_sz = 0; + if (relo->kind == BPF_FIELD_EXISTS) { *val = spec ? 
1 : 0; return 0; @@ -5065,6 +5068,12 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, if (!acc->name) { if (relo->kind == BPF_FIELD_BYTE_OFFSET) { *val = spec->bit_offset / 8; + /* remember field size for load/store mem size */ + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *field_sz = sz; + *type_id = acc->type_id; } else if (relo->kind == BPF_FIELD_BYTE_SIZE) { sz = btf__resolve_size(spec->btf, acc->type_id); if (sz < 0) @@ -5081,7 +5090,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, } m = btf_members(t) + acc->idx; - mt = skip_mods_and_typedefs(spec->btf, m->type, NULL); + mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); bit_off = spec->bit_offset; bit_sz = btf_member_bitfield_size(t, acc->idx); @@ -5101,7 +5110,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, byte_off = bit_off / 8 / byte_sz * byte_sz; } } else { - sz = btf__resolve_size(spec->btf, m->type); + sz = btf__resolve_size(spec->btf, field_type_id); if (sz < 0) return -EINVAL; byte_sz = sz; @@ -5119,6 +5128,10 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, switch (relo->kind) { case BPF_FIELD_BYTE_OFFSET: *val = byte_off; + if (!bitfield) { + *field_sz = byte_sz; + *type_id = field_type_id; + } break; case BPF_FIELD_BYTE_SIZE: *val = byte_sz; @@ -5219,6 +5232,19 @@ struct bpf_core_relo_res bool poison; /* some relocations can't be validated against orig_val */ bool validate; + /* for field byte offset relocations or the forms: + * *(T *)(rX + ) = rY + * rX = *(T *)(rY + ), + * we remember original and resolved field size to adjust direct + * memory loads of pointers and integers; this is necessary for 32-bit + * host kernel architectures, but also allows to automatically + * relocate fields that were resized from, e.g., u32 to u64, etc. + */ + bool fail_memsz_adjust; + __u32 orig_sz; + __u32 orig_type_id; + __u32 new_sz; + __u32 new_type_id; }; /* Calculate original and target relocation values, given local and target @@ -5240,10 +5266,56 @@ static int bpf_core_calc_relo(const struct bpf_program *prog, res->new_val = 0; res->poison = false; res->validate = true; + res->fail_memsz_adjust = false; + res->orig_sz = res->new_sz = 0; + res->orig_type_id = res->new_type_id = 0; if (core_relo_is_field_based(relo->kind)) { - err = bpf_core_calc_field_relo(prog, relo, local_spec, &res->orig_val, &res->validate); - err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec, &res->new_val, NULL); + err = bpf_core_calc_field_relo(prog, relo, local_spec, + &res->orig_val, &res->orig_sz, + &res->orig_type_id, &res->validate); + err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec, + &res->new_val, &res->new_sz, + &res->new_type_id, NULL); + if (err) + goto done; + /* Validate if it's safe to adjust load/store memory size. + * Adjustments are performed only if original and new memory + * sizes differ. 
+ */ + res->fail_memsz_adjust = false; + if (res->orig_sz != res->new_sz) { + const struct btf_type *orig_t, *new_t; + + orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id); + new_t = btf__type_by_id(targ_spec->btf, res->new_type_id); + + /* There are two use cases in which it's safe to + * adjust load/store's mem size: + * - reading a 32-bit kernel pointer, while on BPF + * size pointers are always 64-bit; in this case + * it's safe to "downsize" instruction size due to + * pointer being treated as unsigned integer with + * zero-extended upper 32-bits; + * - reading unsigned integers, again due to + * zero-extension is preserving the value correctly. + * + * In all other cases it's incorrect to attempt to + * load/store field because read value will be + * incorrect, so we poison relocated instruction. + */ + if (btf_is_ptr(orig_t) && btf_is_ptr(new_t)) + goto done; + if (btf_is_int(orig_t) && btf_is_int(new_t) && + btf_int_encoding(orig_t) != BTF_INT_SIGNED && + btf_int_encoding(new_t) != BTF_INT_SIGNED) + goto done; + + /* mark as invalid mem size adjustment, but this will + * only be checked for LDX/STX/ST insns + */ + res->fail_memsz_adjust = true; + } } else if (core_relo_is_type_based(relo->kind)) { err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val); err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val); @@ -5252,6 +5324,7 @@ static int bpf_core_calc_relo(const struct bpf_program *prog, err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val); } +done: if (err == -EUCLEAN) { /* EUCLEAN is used to signal instruction poisoning request */ res->poison = true; @@ -5291,6 +5364,28 @@ static bool is_ldimm64(struct bpf_insn *insn) return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } +static int insn_bpf_size_to_bytes(struct bpf_insn *insn) +{ + switch (BPF_SIZE(insn->code)) { + case BPF_DW: return 8; + case BPF_W: return 4; + case BPF_H: return 2; + case BPF_B: return 1; + default: return -1; + } +} + +static int insn_bytes_to_bpf_size(__u32 sz) +{ + switch (sz) { + case 8: return BPF_DW; + case 4: return BPF_W; + case 2: return BPF_H; + case 1: return BPF_B; + default: return -1; + } +} + /* * Patch relocatable BPF instruction. * @@ -5300,10 +5395,13 @@ static bool is_ldimm64(struct bpf_insn *insn) * spec, and is checked before patching instruction. If actual insn->imm value * is wrong, bail out with error. * - * Currently three kinds of BPF instructions are supported: + * Currently supported classes of BPF instruction are: * 1. rX = (assignment with immediate operand); * 2. rX += (arithmetic operations with immediate operand); - * 3. rX = (load with 64-bit immediate value). + * 3. rX = (load with 64-bit immediate value); + * 4. rX = *(T *)(rY + ), where T is one of {u8, u16, u32, u64}; + * 5. *(T *)(rX + ) = rY, where T is one of {u8, u16, u32, u64}; + * 6. *(T *)(rX + ) = , where T is one of {u8, u16, u32, u64}. */ static int bpf_core_patch_insn(struct bpf_program *prog, const struct bpf_core_relo *relo, @@ -5327,6 +5425,7 @@ static int bpf_core_patch_insn(struct bpf_program *prog, class = BPF_CLASS(insn->code); if (res->poison) { +poison: /* poison second part of ldimm64 to avoid confusing error from * verifier about "unknown opcode 00" */ @@ -5369,10 +5468,39 @@ static int bpf_core_patch_insn(struct bpf_program *prog, prog->name, relo_idx, insn_idx, new_val); return -ERANGE; } + if (res->fail_memsz_adjust) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. 
" + "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", + prog->name, relo_idx, insn_idx); + goto poison; + } + orig_val = insn->off; insn->off = new_val; pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", prog->name, relo_idx, insn_idx, orig_val, new_val); + + if (res->new_sz != res->orig_sz) { + int insn_bytes_sz, insn_bpf_sz; + + insn_bytes_sz = insn_bpf_size_to_bytes(insn); + if (insn_bytes_sz != res->orig_sz) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", + prog->name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); + return -EINVAL; + } + + insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); + if (insn_bpf_sz < 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", + prog->name, relo_idx, insn_idx, res->new_sz); + return -EINVAL; + } + + insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", + prog->name, relo_idx, insn_idx, res->orig_sz, res->new_sz); + } break; case BPF_LD: { __u64 imm; From 2b7d88c2b582c659a6567e851d96873d0209a501 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 7 Oct 2020 17:10:23 -0700 Subject: [PATCH 34/62] libbpf: Allow specifying both ELF and raw BTF for CO-RE BTF override Use generalized BTF parsing logic, making it possible to parse BTF both from ELF file, as well as a raw BTF dump. This makes it easier to write custom tests with manually generated BTFs. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201008001025.292064-4-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 032cf0049ddb..105cb8ad1d75 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5842,7 +5842,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) return 0; if (targ_btf_path) - targ_btf = btf__parse_elf(targ_btf_path, NULL); + targ_btf = btf__parse(targ_btf_path, NULL); else targ_btf = obj->btf_vmlinux; if (IS_ERR_OR_NULL(targ_btf)) { From 888d83b961f6057aa377065b60c5206fd8bff97c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 7 Oct 2020 17:10:24 -0700 Subject: [PATCH 35/62] selftests/bpf: Validate libbpf's auto-sizing of LD/ST/STX instructions Add selftests validating libbpf's auto-resizing of load/store instructions when used with CO-RE relocations. An explicit and manual approach with using bpf_core_read() is also demonstrated and tested. Separate BPF program is supposed to fail due to using signed integers of sizes that differ from kernel's sizes. To reliably simulate 32-bit BTF (i.e., the one with sizeof(long) == sizeof(void *) == 4), selftest generates its own custom BTF and passes it as a replacement for real kernel BTF. This allows to test 32/64-bitness mix on all architectures. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201008001025.292064-5-andrii@kernel.org --- .../selftests/bpf/prog_tests/core_autosize.c | 225 ++++++++++++++++++ .../selftests/bpf/progs/test_core_autosize.c | 172 +++++++++++++ 2 files changed, 397 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/core_autosize.c create mode 100644 tools/testing/selftests/bpf/progs/test_core_autosize.c diff --git a/tools/testing/selftests/bpf/prog_tests/core_autosize.c b/tools/testing/selftests/bpf/prog_tests/core_autosize.c new file mode 100644 index 000000000000..981c251453d9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/core_autosize.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include +#include + +/* real layout and sizes according to test's (32-bit) BTF + * needs to be defined before skeleton is included */ +struct test_struct___real { + unsigned int ptr; /* can't use `void *`, it is always 8 byte in BPF target */ + unsigned int val2; + unsigned long long val1; + unsigned short val3; + unsigned char val4; + unsigned char _pad; +}; + +#include "test_core_autosize.skel.h" + +static int duration = 0; + +static struct { + unsigned long long ptr_samesized; + unsigned long long val1_samesized; + unsigned long long val2_samesized; + unsigned long long val3_samesized; + unsigned long long val4_samesized; + struct test_struct___real output_samesized; + + unsigned long long ptr_downsized; + unsigned long long val1_downsized; + unsigned long long val2_downsized; + unsigned long long val3_downsized; + unsigned long long val4_downsized; + struct test_struct___real output_downsized; + + unsigned long long ptr_probed; + unsigned long long val1_probed; + unsigned long long val2_probed; + unsigned long long val3_probed; + unsigned long long val4_probed; + + unsigned long long ptr_signed; + unsigned long long val1_signed; + unsigned long long val2_signed; + unsigned long long val3_signed; + unsigned long long val4_signed; + struct test_struct___real output_signed; +} out; + +void test_core_autosize(void) +{ + char btf_file[] = "/tmp/core_autosize.btf.XXXXXX"; + int err, fd = -1, zero = 0; + int char_id, short_id, int_id, long_long_id, void_ptr_id, id; + struct test_core_autosize* skel = NULL; + struct bpf_object_load_attr load_attr = {}; + struct bpf_program *prog; + struct bpf_map *bss_map; + struct btf *btf = NULL; + size_t written; + const void *raw_data; + __u32 raw_sz; + FILE *f = NULL; + + btf = btf__new_empty(); + if (!ASSERT_OK_PTR(btf, "empty_btf")) + return; + /* Emit the following struct with 32-bit pointer size: + * + * struct test_struct { + * void *ptr; + * unsigned long val2; + * unsigned long long val1; + * unsigned short val3; + * unsigned char val4; + * char: 8; + * }; + * + * This struct is going to be used as the "kernel BTF" for this test. + * It's equivalent memory-layout-wise to test_struct__real above. 
+ */ + + /* force 32-bit pointer size */ + btf__set_pointer_size(btf, 4); + + char_id = btf__add_int(btf, "unsigned char", 1, 0); + ASSERT_EQ(char_id, 1, "char_id"); + short_id = btf__add_int(btf, "unsigned short", 2, 0); + ASSERT_EQ(short_id, 2, "short_id"); + /* "long unsigned int" of 4 byte size tells BTF that sizeof(void *) == 4 */ + int_id = btf__add_int(btf, "long unsigned int", 4, 0); + ASSERT_EQ(int_id, 3, "int_id"); + long_long_id = btf__add_int(btf, "unsigned long long", 8, 0); + ASSERT_EQ(long_long_id, 4, "long_long_id"); + void_ptr_id = btf__add_ptr(btf, 0); + ASSERT_EQ(void_ptr_id, 5, "void_ptr_id"); + + id = btf__add_struct(btf, "test_struct", 20 /* bytes */); + ASSERT_EQ(id, 6, "struct_id"); + err = btf__add_field(btf, "ptr", void_ptr_id, 0, 0); + err = err ?: btf__add_field(btf, "val2", int_id, 32, 0); + err = err ?: btf__add_field(btf, "val1", long_long_id, 64, 0); + err = err ?: btf__add_field(btf, "val3", short_id, 128, 0); + err = err ?: btf__add_field(btf, "val4", char_id, 144, 0); + ASSERT_OK(err, "struct_fields"); + + fd = mkstemp(btf_file); + if (CHECK(fd < 0, "btf_tmp", "failed to create file: %d\n", fd)) + goto cleanup; + f = fdopen(fd, "w"); + if (!ASSERT_OK_PTR(f, "btf_fdopen")) + goto cleanup; + + raw_data = btf__get_raw_data(btf, &raw_sz); + if (!ASSERT_OK_PTR(raw_data, "raw_data")) + goto cleanup; + written = fwrite(raw_data, 1, raw_sz, f); + if (CHECK(written != raw_sz, "btf_write", "written: %zu, errno: %d\n", written, errno)) + goto cleanup; + fflush(f); + fclose(f); + f = NULL; + close(fd); + fd = -1; + + /* open and load BPF program with custom BTF as the kernel BTF */ + skel = test_core_autosize__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + /* disable handle_signed() for now */ + prog = bpf_object__find_program_by_name(skel->obj, "handle_signed"); + if (!ASSERT_OK_PTR(prog, "prog_find")) + goto cleanup; + bpf_program__set_autoload(prog, false); + + load_attr.obj = skel->obj; + load_attr.target_btf_path = btf_file; + err = bpf_object__load_xattr(&load_attr); + if (!ASSERT_OK(err, "prog_load")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, "handle_samesize"); + if (!ASSERT_OK_PTR(prog, "prog_find")) + goto cleanup; + skel->links.handle_samesize = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(skel->links.handle_samesize, "prog_attach")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, "handle_downsize"); + if (!ASSERT_OK_PTR(prog, "prog_find")) + goto cleanup; + skel->links.handle_downsize = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(skel->links.handle_downsize, "prog_attach")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, "handle_probed"); + if (!ASSERT_OK_PTR(prog, "prog_find")) + goto cleanup; + skel->links.handle_probed = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(skel->links.handle_probed, "prog_attach")) + goto cleanup; + + usleep(1); + + bss_map = bpf_object__find_map_by_name(skel->obj, "test_cor.bss"); + if (!ASSERT_OK_PTR(bss_map, "bss_map_find")) + goto cleanup; + + err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &zero, (void *)&out); + if (!ASSERT_OK(err, "bss_lookup")) + goto cleanup; + + ASSERT_EQ(out.ptr_samesized, 0x01020304, "ptr_samesized"); + ASSERT_EQ(out.val1_samesized, 0x1020304050607080, "val1_samesized"); + ASSERT_EQ(out.val2_samesized, 0x0a0b0c0d, "val2_samesized"); + ASSERT_EQ(out.val3_samesized, 0xfeed, "val3_samesized"); + ASSERT_EQ(out.val4_samesized, 0xb9, "val4_samesized"); + ASSERT_EQ(out.output_samesized.ptr, 0x01020304, 
"ptr_samesized"); + ASSERT_EQ(out.output_samesized.val1, 0x1020304050607080, "val1_samesized"); + ASSERT_EQ(out.output_samesized.val2, 0x0a0b0c0d, "val2_samesized"); + ASSERT_EQ(out.output_samesized.val3, 0xfeed, "val3_samesized"); + ASSERT_EQ(out.output_samesized.val4, 0xb9, "val4_samesized"); + + ASSERT_EQ(out.ptr_downsized, 0x01020304, "ptr_downsized"); + ASSERT_EQ(out.val1_downsized, 0x1020304050607080, "val1_downsized"); + ASSERT_EQ(out.val2_downsized, 0x0a0b0c0d, "val2_downsized"); + ASSERT_EQ(out.val3_downsized, 0xfeed, "val3_downsized"); + ASSERT_EQ(out.val4_downsized, 0xb9, "val4_downsized"); + ASSERT_EQ(out.output_downsized.ptr, 0x01020304, "ptr_downsized"); + ASSERT_EQ(out.output_downsized.val1, 0x1020304050607080, "val1_downsized"); + ASSERT_EQ(out.output_downsized.val2, 0x0a0b0c0d, "val2_downsized"); + ASSERT_EQ(out.output_downsized.val3, 0xfeed, "val3_downsized"); + ASSERT_EQ(out.output_downsized.val4, 0xb9, "val4_downsized"); + + ASSERT_EQ(out.ptr_probed, 0x01020304, "ptr_probed"); + ASSERT_EQ(out.val1_probed, 0x1020304050607080, "val1_probed"); + ASSERT_EQ(out.val2_probed, 0x0a0b0c0d, "val2_probed"); + ASSERT_EQ(out.val3_probed, 0xfeed, "val3_probed"); + ASSERT_EQ(out.val4_probed, 0xb9, "val4_probed"); + + test_core_autosize__destroy(skel); + skel = NULL; + + /* now re-load with handle_signed() enabled, it should fail loading */ + skel = test_core_autosize__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + load_attr.obj = skel->obj; + load_attr.target_btf_path = btf_file; + err = bpf_object__load_xattr(&load_attr); + if (!ASSERT_ERR(err, "bad_prog_load")) + goto cleanup; + +cleanup: + if (f) + fclose(f); + if (fd >= 0) + close(fd); + remove(btf_file); + btf__free(btf); + test_core_autosize__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_core_autosize.c b/tools/testing/selftests/bpf/progs/test_core_autosize.c new file mode 100644 index 000000000000..44f5aa2e8956 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_core_autosize.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* fields of exactly the same size */ +struct test_struct___samesize { + void *ptr; + unsigned long long val1; + unsigned int val2; + unsigned short val3; + unsigned char val4; +} __attribute((preserve_access_index)); + +/* unsigned fields that have to be downsized by libbpf */ +struct test_struct___downsize { + void *ptr; + unsigned long val1; + unsigned long val2; + unsigned long val3; + unsigned long val4; + /* total sz: 40 */ +} __attribute__((preserve_access_index)); + +/* fields with signed integers of wrong size, should be rejected */ +struct test_struct___signed { + void *ptr; + long val1; + long val2; + long val3; + long val4; +} __attribute((preserve_access_index)); + +/* real layout and sizes according to test's (32-bit) BTF */ +struct test_struct___real { + unsigned int ptr; /* can't use `void *`, it is always 8 byte in BPF target */ + unsigned int val2; + unsigned long long val1; + unsigned short val3; + unsigned char val4; + unsigned char _pad; + /* total sz: 20 */ +}; + +struct test_struct___real input = { + .ptr = 0x01020304, + .val1 = 0x1020304050607080, + .val2 = 0x0a0b0c0d, + .val3 = 0xfeed, + .val4 = 0xb9, + ._pad = 0xff, /* make sure no accidental zeros are present */ +}; + +unsigned long long ptr_samesized = 0; +unsigned long long val1_samesized = 0; +unsigned long long val2_samesized = 0; +unsigned long long 
val3_samesized = 0; +unsigned long long val4_samesized = 0; +struct test_struct___real output_samesized = {}; + +unsigned long long ptr_downsized = 0; +unsigned long long val1_downsized = 0; +unsigned long long val2_downsized = 0; +unsigned long long val3_downsized = 0; +unsigned long long val4_downsized = 0; +struct test_struct___real output_downsized = {}; + +unsigned long long ptr_probed = 0; +unsigned long long val1_probed = 0; +unsigned long long val2_probed = 0; +unsigned long long val3_probed = 0; +unsigned long long val4_probed = 0; + +unsigned long long ptr_signed = 0; +unsigned long long val1_signed = 0; +unsigned long long val2_signed = 0; +unsigned long long val3_signed = 0; +unsigned long long val4_signed = 0; +struct test_struct___real output_signed = {}; + +SEC("raw_tp/sys_exit") +int handle_samesize(void *ctx) +{ + struct test_struct___samesize *in = (void *)&input; + struct test_struct___samesize *out = (void *)&output_samesized; + + ptr_samesized = (unsigned long long)in->ptr; + val1_samesized = in->val1; + val2_samesized = in->val2; + val3_samesized = in->val3; + val4_samesized = in->val4; + + out->ptr = in->ptr; + out->val1 = in->val1; + out->val2 = in->val2; + out->val3 = in->val3; + out->val4 = in->val4; + + return 0; +} + +SEC("raw_tp/sys_exit") +int handle_downsize(void *ctx) +{ + struct test_struct___downsize *in = (void *)&input; + struct test_struct___downsize *out = (void *)&output_downsized; + + ptr_downsized = (unsigned long long)in->ptr; + val1_downsized = in->val1; + val2_downsized = in->val2; + val3_downsized = in->val3; + val4_downsized = in->val4; + + out->ptr = in->ptr; + out->val1 = in->val1; + out->val2 = in->val2; + out->val3 = in->val3; + out->val4 = in->val4; + + return 0; +} + +SEC("raw_tp/sys_enter") +int handle_probed(void *ctx) +{ + struct test_struct___downsize *in = (void *)&input; + __u64 tmp; + + tmp = 0; + bpf_core_read(&tmp, bpf_core_field_size(in->ptr), &in->ptr); + ptr_probed = tmp; + + tmp = 0; + bpf_core_read(&tmp, bpf_core_field_size(in->val1), &in->val1); + val1_probed = tmp; + + tmp = 0; + bpf_core_read(&tmp, bpf_core_field_size(in->val2), &in->val2); + val2_probed = tmp; + + tmp = 0; + bpf_core_read(&tmp, bpf_core_field_size(in->val3), &in->val3); + val3_probed = tmp; + + tmp = 0; + bpf_core_read(&tmp, bpf_core_field_size(in->val4), &in->val4); + val4_probed = tmp; + + return 0; +} + +SEC("raw_tp/sys_enter") +int handle_signed(void *ctx) +{ + struct test_struct___signed *in = (void *)&input; + struct test_struct___signed *out = (void *)&output_signed; + + val2_signed = in->val2; + val3_signed = in->val3; + val4_signed = in->val4; + + out->val2= in->val2; + out->val3= in->val3; + out->val4= in->val4; + + return 0; +} From c3f01fdcedf9459b2a27b22287414ab3a0d5d818 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 8 Oct 2020 16:12:18 +0200 Subject: [PATCH 36/62] xsk: Introduce padding between ring pointers Introduce one cache line worth of padding between the producer and consumer pointers in all the lockless rings. This so that the HW adjacency prefetcher will not prefetch the consumer pointer when the producer pointer is used and vice versa. This improves throughput performance for the l2fwd sample app with 2% on my machine with HW prefetching turned on. 
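The layout idea in plain C, as a sketch assuming 64-byte cache lines (the kernel code uses ____cacheline_aligned_in_smp rather than a hard-coded constant):

  #define CACHE_LINE 64

  struct ring {
          /* written by the producer core */
          unsigned int producer __attribute__((aligned(CACHE_LINE)));
          /* a full cache line of padding: producer and consumer now sit two
           * lines apart, so the adjacent-cache-line prefetcher that pulls in
           * the neighbouring line of whichever pointer is touched no longer
           * drags the other pointer's line into the cache
           */
          unsigned char pad[CACHE_LINE] __attribute__((aligned(CACHE_LINE)));
          /* written by the consumer core */
          unsigned int consumer __attribute__((aligned(CACHE_LINE)));
          unsigned int flags;
  };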
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1602166338-21378-1-git-send-email-magnus.karlsson@gmail.com --- net/xdp/xsk_queue.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index dc1dd5ef70d1..cdb9cf3cd136 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -15,6 +15,10 @@ struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; + /* Hinder the adjacent cache prefetcher to prefetch the consumer + * pointer if the producer pointer is touched and vice versa. + */ + u32 pad ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; u32 flags; }; From eca43ee6c46db92dd850ce659316b0680d70e137 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Fri, 9 Oct 2020 07:03:25 +0000 Subject: [PATCH 37/62] bpf: Add tcp_notsent_lowat bpf setsockopt Adding support for TCP_NOTSENT_LOWAT sockoption (https://lwn.net/Articles/560082/) in tcp bpf programs. Signed-off-by: Nikita V. Shirokov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201009070325.226855-1-tehnerd@tehnerd.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 4 ++++ tools/include/uapi/linux/bpf.h | 2 +- .../selftests/bpf/progs/connect4_prog.c | 19 +++++++++++++++++++ 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d83561e8cd2c..42d2df799397 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1698,7 +1698,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/net/core/filter.c b/net/core/filter.c index 05df73780dd3..5da44b11e1ec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4827,6 +4827,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else icsk->icsk_user_timeout = val; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d83561e8cd2c..42d2df799397 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1698,7 +1698,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 
* Return diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index b1b2773c0b9d..a943d394fd3a 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -23,6 +23,10 @@ #define TCP_CA_NAME_MAX 16 #endif +#ifndef TCP_NOTSENT_LOWAT +#define TCP_NOTSENT_LOWAT 25 +#endif + #ifndef IFNAMSIZ #define IFNAMSIZ 16 #endif @@ -128,6 +132,18 @@ static __inline int set_keepalive(struct bpf_sock_addr *ctx) return 0; } +static __inline int set_notsent_lowat(struct bpf_sock_addr *ctx) +{ + int lowat = 65535; + + if (ctx->type == SOCK_STREAM) { + if (bpf_setsockopt(ctx, SOL_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat))) + return 1; + } + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -148,6 +164,9 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) if (set_keepalive(ctx)) return 0; + if (set_notsent_lowat(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) From 75748837b7e56919679e02163f45d5818c644d03 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 8 Oct 2020 18:12:37 -0700 Subject: [PATCH 38/62] bpf: Propagate scalar ranges through register assignments. The llvm register allocator may use two different registers representing the same virtual register. In such case the following pattern can be observed: 1047: (bf) r9 = r6 1048: (a5) if r6 < 0x1000 goto pc+1 1050: ... 1051: (a5) if r9 < 0x2 goto pc+66 1052: ... 1053: (bf) r2 = r9 /* r2 needs to have upper and lower bounds */ This is normal behavior of greedy register allocator. The slides 137+ explain why regalloc introduces such register copy: http://llvm.org/devmtg/2018-04/slides/Yatsina-LLVM%20Greedy%20Register%20Allocator.pdf There is no way to tell llvm 'not to do this'. Hence the verifier has to recognize such patterns. In order to track this information without backtracking allocate ID for scalars in a similar way as it's done for find_good_pkt_pointers(). When the verifier encounters r9 = r6 assignment it will assign the same ID to both registers. Later if either register range is narrowed via conditional jump propagate the register state into the other register. Clear register ID in adjust_reg_min_max_vals() for any alu instruction. The register ID is ignored for scalars in regsafe() and doesn't affect state pruning. mark_reg_unknown() clears the ID. It's used to process call, endian and other instructions. Hence ID is explicitly cleared only in adjust_reg_min_max_vals() and in 32-bit mov. 
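A minimal C sketch (not from the patch) of the source pattern behind such a register copy: under register pressure the compiler may keep 'off' and 'idx' in two different registers even though they hold the same value, and emit the bounds check against only one of them.

  char buf[256];

  int lookup(unsigned int off)
  {
          unsigned int idx = off;         /* may become r9 = r6 */

          if (off >= sizeof(buf))         /* narrows the range of 'off'... */
                  return 0;
          return buf[idx];                /* ...which must also bound 'idx' for the
                                           * access to verify; giving both registers
                                           * the same ID lets the verifier propagate
                                           * the narrowed range to the copy
                                           */
  }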
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201009011240.48506-2-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 50 +++++++++++++++++++ .../testing/selftests/bpf/prog_tests/align.c | 16 +++--- .../bpf/verifier/direct_packet_access.c | 2 +- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 62b804651a48..ba96f7e9bbc0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6436,6 +6436,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, src_reg = NULL; if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; + else + /* Make sure ID is cleared otherwise dst_reg min/max could be + * incorrectly propagated into other registers by find_equal_scalars() + */ + dst_reg->id = 0; if (BPF_SRC(insn->code) == BPF_X) { src_reg = ®s[insn->src_reg]; if (src_reg->type != SCALAR_VALUE) { @@ -6569,6 +6574,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: R1 = R2 * copy register state to dest reg */ + if (src_reg->type == SCALAR_VALUE && !src_reg->id) + /* Assign src and dst registers the same ID + * that will be used by find_equal_scalars() + * to propagate min/max range. + */ + src_reg->id = ++env->id_gen; *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; @@ -6581,6 +6592,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; + /* Make sure ID is cleared otherwise + * dst_reg min/max could be incorrectly + * propagated into src_reg by find_equal_scalars() + */ + dst_reg->id = 0; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { @@ -7369,6 +7385,30 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, return true; } +static void find_equal_scalars(struct bpf_verifier_state *vstate, + struct bpf_reg_state *known_reg) +{ + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int i, j; + + for (i = 0; i <= vstate->curframe; i++) { + state = vstate->frame[i]; + for (j = 0; j < MAX_BPF_REG; j++) { + reg = &state->regs[j]; + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + } + + bpf_for_each_spilled_reg(j, state, reg) { + if (!reg) + continue; + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + } + } +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -7497,6 +7537,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], src_reg, dst_reg, opcode); + if (src_reg->id) { + find_equal_scalars(this_branch, src_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + } + } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], @@ -7504,6 +7549,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, opcode, is_jmp32); } + if (dst_reg->type == SCALAR_VALUE && dst_reg->id) { + find_equal_scalars(this_branch, dst_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); + } + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. 
diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index c548aded6585..52414058a627 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -195,13 +195,13 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {8, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {10, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, - {12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {12, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - {14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {14, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, }, @@ -518,7 +518,7 @@ static struct bpf_align_test tests[] = { * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, + {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, }, }, @@ -561,18 +561,18 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, + {13, "R5_w=pkt(id=2,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, + {16, "R5_w=pkt(id=3,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. 
*/ - {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, + {20, "R5=pkt(id=3,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, }, }, }; diff --git a/tools/testing/selftests/bpf/verifier/direct_packet_access.c b/tools/testing/selftests/bpf/verifier/direct_packet_access.c index 2c5fbe7bcd27..ae72536603fe 100644 --- a/tools/testing/selftests/bpf/verifier/direct_packet_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_packet_access.c @@ -529,7 +529,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "invalid access to packet, off=0 size=8, R5(id=1,off=0,r=0)", + .errstr = "invalid access to packet, off=0 size=8, R5(id=2,off=0,r=0)", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { From 5689d49b71ad25409e41c6052dd9549fbeaf9bde Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 8 Oct 2020 18:12:38 -0700 Subject: [PATCH 39/62] bpf: Track spill/fill of bounded scalars. Under register pressure the llvm may spill registers with bounds into the stack. The verifier has to track them through spill/fill otherwise many kinds of bound errors will be seen. The spill/fill of induction variables was already happening. This patch extends this logic from tracking spill/fill of a constant into any bounded register. There is no need to track spill/fill of unbounded, since no new information will be retrieved from the stack during register fill. Though extra stack difference could cause state pruning to be less effective, no adverse affects were seen from this patch on selftests and on cilium programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201009011240.48506-3-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ba96f7e9bbc0..f3e36eade3d4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2227,6 +2227,20 @@ static bool register_is_const(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); } +static bool __is_scalar_unbounded(struct bpf_reg_state *reg) +{ + return tnum_is_unknown(reg->var_off) && + reg->smin_value == S64_MIN && reg->smax_value == S64_MAX && + reg->umin_value == 0 && reg->umax_value == U64_MAX && + reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX && + reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX; +} + +static bool register_is_bounded(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg); +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -2278,7 +2292,7 @@ static int check_stack_write(struct bpf_verifier_env *env, if (value_regno >= 0) reg = &cur->regs[value_regno]; - if (reg && size == BPF_REG_SIZE && register_is_const(reg) && + if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit From 03d4d13fab3fa75fbcf09bced5e3c8acf1622969 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 8 Oct 2020 18:12:39 -0700 Subject: [PATCH 40/62] selftests/bpf: Add profiler test The main purpose of the profiler test to check different llvm generation patterns to make sure the verifier can load these large programs. 
Note that profiler.inc.h test doesn't follow strict kernel coding style. The code was formatted in the kernel style, but variable declarations are kept as-is to preserve original llvm IR pattern. profiler1.c should pass with older and newer llvm profiler[23].c may fail on older llvm that don't have: https://reviews.llvm.org/D85570 because llvm may do speculative code motion optimization that will generate code like this: // r9 is a pointer to map_value // r7 is a scalar 17: bf 96 00 00 00 00 00 00 r6 = r9 18: 0f 76 00 00 00 00 00 00 r6 += r7 19: a5 07 01 00 01 01 00 00 if r7 < 257 goto +1 20: bf 96 00 00 00 00 00 00 r6 = r9 // r6 is used here The verifier will reject such code with the error: "math between map_value pointer and register with unbounded min value is not allowed" At insn 18 the r7 is indeed unbounded. The later insn 19 checks the bounds and the insn 20 undoes map_value addition. It is currently impossible for the verifier to understand such speculative pointer arithmetic. Hence llvm D85570 addresses it on the compiler side. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201009011240.48506-4-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/README.rst | 38 + .../selftests/bpf/prog_tests/test_profiler.c | 72 ++ tools/testing/selftests/bpf/progs/profiler.h | 177 ++++ .../selftests/bpf/progs/profiler.inc.h | 969 ++++++++++++++++++ tools/testing/selftests/bpf/progs/profiler1.c | 6 + tools/testing/selftests/bpf/progs/profiler2.c | 6 + tools/testing/selftests/bpf/progs/profiler3.c | 6 + 7 files changed, 1274 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_profiler.c create mode 100644 tools/testing/selftests/bpf/progs/profiler.h create mode 100644 tools/testing/selftests/bpf/progs/profiler.inc.h create mode 100644 tools/testing/selftests/bpf/progs/profiler1.c create mode 100644 tools/testing/selftests/bpf/progs/profiler2.c create mode 100644 tools/testing/selftests/bpf/progs/profiler3.c diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 66acfcf15ff2..ac9eda830187 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -7,6 +7,44 @@ General instructions on running selftests can be found in Additional information about selftest failures are documented here. +profiler[23] test failures with clang/llvm <12.0.0 +================================================== + +With clang/llvm <12.0.0, the profiler[23] test may fail. +The symptom looks like + +.. code-block:: c + + // r9 is a pointer to map_value + // r7 is a scalar + 17: bf 96 00 00 00 00 00 00 r6 = r9 + 18: 0f 76 00 00 00 00 00 00 r6 += r7 + math between map_value pointer and register with unbounded min value is not allowed + + // the instructions below will not be seen in the verifier log + 19: a5 07 01 00 01 01 00 00 if r7 < 257 goto +1 + 20: bf 96 00 00 00 00 00 00 r6 = r9 + // r6 is used here + +The verifier will reject such code with above error. +At insn 18 the r7 is indeed unbounded. The later insn 19 checks the bounds and +the insn 20 undoes map_value addition. It is currently impossible for the +verifier to understand such speculative pointer arithmetic. +Hence + https://reviews.llvm.org/D85570 +addresses it on the compiler side. It was committed on llvm 12. + +The corresponding C code +.. 
code-block:: c + + for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) { + filepart_length = bpf_probe_read_str(payload, ...); + if (filepart_length <= MAX_PATH) { + barrier_var(filepart_length); // workaround + payload += filepart_length; + } + } + bpf_iter test failures with clang/llvm 10.0.0 ============================================= diff --git a/tools/testing/selftests/bpf/prog_tests/test_profiler.c b/tools/testing/selftests/bpf/prog_tests/test_profiler.c new file mode 100644 index 000000000000..4ca275101ee0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_profiler.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include "progs/profiler.h" +#include "profiler1.skel.h" +#include "profiler2.skel.h" +#include "profiler3.skel.h" + +static int sanity_run(struct bpf_program *prog) +{ + struct bpf_prog_test_run_attr test_attr = {}; + __u64 args[] = {1, 2, 3}; + __u32 duration = 0; + int err, prog_fd; + + prog_fd = bpf_program__fd(prog); + test_attr.prog_fd = prog_fd; + test_attr.ctx_in = args; + test_attr.ctx_size_in = sizeof(args); + err = bpf_prog_test_run_xattr(&test_attr); + if (CHECK(err || test_attr.retval, "test_run", + "err %d errno %d retval %d duration %d\n", + err, errno, test_attr.retval, duration)) + return -1; + return 0; +} + +void test_test_profiler(void) +{ + struct profiler1 *profiler1_skel = NULL; + struct profiler2 *profiler2_skel = NULL; + struct profiler3 *profiler3_skel = NULL; + __u32 duration = 0; + int err; + + profiler1_skel = profiler1__open_and_load(); + if (CHECK(!profiler1_skel, "profiler1_skel_load", "profiler1 skeleton failed\n")) + goto cleanup; + + err = profiler1__attach(profiler1_skel); + if (CHECK(err, "profiler1_attach", "profiler1 attach failed: %d\n", err)) + goto cleanup; + + if (sanity_run(profiler1_skel->progs.raw_tracepoint__sched_process_exec)) + goto cleanup; + + profiler2_skel = profiler2__open_and_load(); + if (CHECK(!profiler2_skel, "profiler2_skel_load", "profiler2 skeleton failed\n")) + goto cleanup; + + err = profiler2__attach(profiler2_skel); + if (CHECK(err, "profiler2_attach", "profiler2 attach failed: %d\n", err)) + goto cleanup; + + if (sanity_run(profiler2_skel->progs.raw_tracepoint__sched_process_exec)) + goto cleanup; + + profiler3_skel = profiler3__open_and_load(); + if (CHECK(!profiler3_skel, "profiler3_skel_load", "profiler3 skeleton failed\n")) + goto cleanup; + + err = profiler3__attach(profiler3_skel); + if (CHECK(err, "profiler3_attach", "profiler3 attach failed: %d\n", err)) + goto cleanup; + + if (sanity_run(profiler3_skel->progs.raw_tracepoint__sched_process_exec)) + goto cleanup; +cleanup: + profiler1__destroy(profiler1_skel); + profiler2__destroy(profiler2_skel); + profiler3__destroy(profiler3_skel); +} diff --git a/tools/testing/selftests/bpf/progs/profiler.h b/tools/testing/selftests/bpf/progs/profiler.h new file mode 100644 index 000000000000..3bac4fdd4bdf --- /dev/null +++ b/tools/testing/selftests/bpf/progs/profiler.h @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#pragma once + +#define TASK_COMM_LEN 16 +#define MAX_ANCESTORS 4 +#define MAX_PATH 256 +#define KILL_TARGET_LEN 64 +#define CTL_MAXNAME 10 +#define MAX_ARGS_LEN 4096 +#define MAX_FILENAME_LEN 512 +#define MAX_ENVIRON_LEN 8192 +#define MAX_PATH_DEPTH 32 +#define MAX_FILEPATH_LENGTH (MAX_PATH_DEPTH * MAX_PATH) +#define MAX_CGROUPS_PATH_DEPTH 8 + +#define MAX_METADATA_PAYLOAD_LEN TASK_COMM_LEN + +#define MAX_CGROUP_PAYLOAD_LEN \ + (MAX_PATH * 2 + 
(MAX_PATH * MAX_CGROUPS_PATH_DEPTH)) + +#define MAX_CAP_PAYLOAD_LEN (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN) + +#define MAX_SYSCTL_PAYLOAD_LEN \ + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + CTL_MAXNAME + MAX_PATH) + +#define MAX_KILL_PAYLOAD_LEN \ + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + TASK_COMM_LEN + \ + KILL_TARGET_LEN) + +#define MAX_EXEC_PAYLOAD_LEN \ + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILENAME_LEN + \ + MAX_ARGS_LEN + MAX_ENVIRON_LEN) + +#define MAX_FILEMOD_PAYLOAD_LEN \ + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILEPATH_LENGTH + \ + MAX_FILEPATH_LENGTH) + +enum data_type { + INVALID_EVENT, + EXEC_EVENT, + FORK_EVENT, + KILL_EVENT, + SYSCTL_EVENT, + FILEMOD_EVENT, + MAX_DATA_TYPE_EVENT +}; + +enum filemod_type { + FMOD_OPEN, + FMOD_LINK, + FMOD_SYMLINK, +}; + +struct ancestors_data_t { + pid_t ancestor_pids[MAX_ANCESTORS]; + uint32_t ancestor_exec_ids[MAX_ANCESTORS]; + uint64_t ancestor_start_times[MAX_ANCESTORS]; + uint32_t num_ancestors; +}; + +struct var_metadata_t { + enum data_type type; + pid_t pid; + uint32_t exec_id; + uid_t uid; + gid_t gid; + uint64_t start_time; + uint32_t cpu_id; + uint64_t bpf_stats_num_perf_events; + uint64_t bpf_stats_start_ktime_ns; + uint8_t comm_length; +}; + +struct cgroup_data_t { + ino_t cgroup_root_inode; + ino_t cgroup_proc_inode; + uint64_t cgroup_root_mtime; + uint64_t cgroup_proc_mtime; + uint16_t cgroup_root_length; + uint16_t cgroup_proc_length; + uint16_t cgroup_full_length; + int cgroup_full_path_root_pos; +}; + +struct var_sysctl_data_t { + struct var_metadata_t meta; + struct cgroup_data_t cgroup_data; + struct ancestors_data_t ancestors_info; + uint8_t sysctl_val_length; + uint16_t sysctl_path_length; + char payload[MAX_SYSCTL_PAYLOAD_LEN]; +}; + +struct var_kill_data_t { + struct var_metadata_t meta; + struct cgroup_data_t cgroup_data; + struct ancestors_data_t ancestors_info; + pid_t kill_target_pid; + int kill_sig; + uint32_t kill_count; + uint64_t last_kill_time; + uint8_t kill_target_name_length; + uint8_t kill_target_cgroup_proc_length; + char payload[MAX_KILL_PAYLOAD_LEN]; + size_t payload_length; +}; + +struct var_exec_data_t { + struct var_metadata_t meta; + struct cgroup_data_t cgroup_data; + pid_t parent_pid; + uint32_t parent_exec_id; + uid_t parent_uid; + uint64_t parent_start_time; + uint16_t bin_path_length; + uint16_t cmdline_length; + uint16_t environment_length; + char payload[MAX_EXEC_PAYLOAD_LEN]; +}; + +struct var_fork_data_t { + struct var_metadata_t meta; + pid_t parent_pid; + uint32_t parent_exec_id; + uint64_t parent_start_time; + char payload[MAX_METADATA_PAYLOAD_LEN]; +}; + +struct var_filemod_data_t { + struct var_metadata_t meta; + struct cgroup_data_t cgroup_data; + enum filemod_type fmod_type; + unsigned int dst_flags; + uint32_t src_device_id; + uint32_t dst_device_id; + ino_t src_inode; + ino_t dst_inode; + uint16_t src_filepath_length; + uint16_t dst_filepath_length; + char payload[MAX_FILEMOD_PAYLOAD_LEN]; +}; + +struct profiler_config_struct { + bool fetch_cgroups_from_bpf; + ino_t cgroup_fs_inode; + ino_t cgroup_login_session_inode; + uint64_t kill_signals_mask; + ino_t inode_filter; + uint32_t stale_info_secs; + bool use_variable_buffers; + bool read_environ_from_exec; + bool enable_cgroup_v1_resolver; +}; + +struct bpf_func_stats_data { + uint64_t time_elapsed_ns; + uint64_t num_executions; + uint64_t num_perf_events; +}; + +struct bpf_func_stats_ctx { + uint64_t start_time_ns; + struct bpf_func_stats_data* 
bpf_func_stats_data_val; +}; + +enum bpf_function_id { + profiler_bpf_proc_sys_write, + profiler_bpf_sched_process_exec, + profiler_bpf_sched_process_exit, + profiler_bpf_sys_enter_kill, + profiler_bpf_do_filp_open_ret, + profiler_bpf_sched_process_fork, + profiler_bpf_vfs_link, + profiler_bpf_vfs_symlink, + profiler_bpf_max_function_id +}; diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h new file mode 100644 index 000000000000..00578311a423 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -0,0 +1,969 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include + +#include "profiler.h" + +#ifndef NULL +#define NULL 0 +#endif + +#define O_WRONLY 00000001 +#define O_RDWR 00000002 +#define O_DIRECTORY 00200000 +#define __O_TMPFILE 020000000 +#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +#define MAX_ERRNO 4095 +#define S_IFMT 00170000 +#define S_IFSOCK 0140000 +#define S_IFLNK 0120000 +#define S_IFREG 0100000 +#define S_IFBLK 0060000 +#define S_IFDIR 0040000 +#define S_IFCHR 0020000 +#define S_IFIFO 0010000 +#define S_ISUID 0004000 +#define S_ISGID 0002000 +#define S_ISVTX 0001000 +#define S_ISLNK(m) (((m)&S_IFMT) == S_IFLNK) +#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR) +#define S_ISCHR(m) (((m)&S_IFMT) == S_IFCHR) +#define S_ISBLK(m) (((m)&S_IFMT) == S_IFBLK) +#define S_ISFIFO(m) (((m)&S_IFMT) == S_IFIFO) +#define S_ISSOCK(m) (((m)&S_IFMT) == S_IFSOCK) +#define IS_ERR_VALUE(x) (unsigned long)(void*)(x) >= (unsigned long)-MAX_ERRNO + +#define KILL_DATA_ARRAY_SIZE 8 + +struct var_kill_data_arr_t { + struct var_kill_data_t array[KILL_DATA_ARRAY_SIZE]; +}; + +union any_profiler_data_t { + struct var_exec_data_t var_exec; + struct var_kill_data_t var_kill; + struct var_sysctl_data_t var_sysctl; + struct var_filemod_data_t var_filemod; + struct var_fork_data_t var_fork; + struct var_kill_data_arr_t var_kill_data_arr; +}; + +volatile struct profiler_config_struct bpf_config = {}; + +#define FETCH_CGROUPS_FROM_BPF (bpf_config.fetch_cgroups_from_bpf) +#define CGROUP_FS_INODE (bpf_config.cgroup_fs_inode) +#define CGROUP_LOGIN_SESSION_INODE \ + (bpf_config.cgroup_login_session_inode) +#define KILL_SIGNALS (bpf_config.kill_signals_mask) +#define STALE_INFO (bpf_config.stale_info_secs) +#define INODE_FILTER (bpf_config.inode_filter) +#define READ_ENVIRON_FROM_EXEC (bpf_config.read_environ_from_exec) +#define ENABLE_CGROUP_V1_RESOLVER (bpf_config.enable_cgroup_v1_resolver) + +struct kernfs_iattrs___52 { + struct iattr ia_iattr; +}; + +struct kernfs_node___52 { + union /* kernfs_node_id */ { + struct { + u32 ino; + u32 generation; + }; + u64 id; + } id; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, union any_profiler_data_t); +} data_heap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} events SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, KILL_DATA_ARRAY_SIZE); + __type(key, u32); + __type(value, struct var_kill_data_arr_t); +} var_tpid_to_data SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, profiler_bpf_max_function_id); + __type(key, u32); + __type(value, struct bpf_func_stats_data); +} bpf_func_stats SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, bool); + 
__uint(max_entries, 16); +} allowed_devices SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); + __type(value, bool); + __uint(max_entries, 1024); +} allowed_file_inodes SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); + __type(value, bool); + __uint(max_entries, 1024); +} allowed_directory_inodes SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, bool); + __uint(max_entries, 16); +} disallowed_exec_inodes SEC(".maps"); + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) +#endif + +static INLINE bool IS_ERR(const void* ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +static INLINE u32 get_userspace_pid() +{ + return bpf_get_current_pid_tgid() >> 32; +} + +static INLINE bool is_init_process(u32 tgid) +{ + return tgid == 1 || tgid == 0; +} + +static INLINE unsigned long +probe_read_lim(void* dst, void* src, unsigned long len, unsigned long max) +{ + len = len < max ? len : max; + if (len > 1) { + if (bpf_probe_read(dst, len, src)) + return 0; + } else if (len == 1) { + if (bpf_probe_read(dst, 1, src)) + return 0; + } + return len; +} + +static INLINE int get_var_spid_index(struct var_kill_data_arr_t* arr_struct, + int spid) +{ +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) + if (arr_struct->array[i].meta.pid == spid) + return i; + return -1; +} + +static INLINE void populate_ancestors(struct task_struct* task, + struct ancestors_data_t* ancestors_data) +{ + struct task_struct* parent = task; + u32 num_ancestors, ppid; + + ancestors_data->num_ancestors = 0; +#ifdef UNROLL +#pragma unroll +#endif + for (num_ancestors = 0; num_ancestors < MAX_ANCESTORS; num_ancestors++) { + parent = BPF_CORE_READ(parent, real_parent); + if (parent == NULL) + break; + ppid = BPF_CORE_READ(parent, tgid); + if (is_init_process(ppid)) + break; + ancestors_data->ancestor_pids[num_ancestors] = ppid; + ancestors_data->ancestor_exec_ids[num_ancestors] = + BPF_CORE_READ(parent, self_exec_id); + ancestors_data->ancestor_start_times[num_ancestors] = + BPF_CORE_READ(parent, start_time); + ancestors_data->num_ancestors = num_ancestors; + } +} + +static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node, + struct kernfs_node* cgroup_root_node, + void* payload, + int* root_pos) +{ + void* payload_start = payload; + size_t filepart_length; + +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) { + filepart_length = + bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(cgroup_node, name)); + if (!cgroup_node) + return payload; + if (cgroup_node == cgroup_root_node) + *root_pos = payload - payload_start; + if (filepart_length <= MAX_PATH) { + barrier_var(filepart_length); + payload += filepart_length; + } + cgroup_node = BPF_CORE_READ(cgroup_node, parent); + } + return payload; +} + +static ino_t get_inode_from_kernfs(struct kernfs_node* node) +{ + struct kernfs_node___52* node52 = (void*)node; + + if (bpf_core_field_exists(node52->id.ino)) { + barrier_var(node52); + return BPF_CORE_READ(node52, id.ino); + } else { + barrier_var(node); + return (u64)BPF_CORE_READ(node, id); + } +} + +int pids_cgrp_id = 1; + +static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, + struct task_struct* task, + void* payload) +{ + struct kernfs_node* root_kernfs = + BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn); + struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, 
cgroups, dfl_cgrp, kn); + + if (ENABLE_CGROUP_V1_RESOLVER) { +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys_state* subsys = + BPF_CORE_READ(task, cgroups, subsys[i]); + if (subsys != NULL) { + int subsys_id = BPF_CORE_READ(subsys, ss, id); + if (subsys_id == pids_cgrp_id) { + proc_kernfs = BPF_CORE_READ(subsys, cgroup, kn); + root_kernfs = BPF_CORE_READ(subsys, ss, root, kf_root, kn); + break; + } + } + } + } + + cgroup_data->cgroup_root_inode = get_inode_from_kernfs(root_kernfs); + cgroup_data->cgroup_proc_inode = get_inode_from_kernfs(proc_kernfs); + + if (bpf_core_field_exists(root_kernfs->iattr->ia_mtime)) { + cgroup_data->cgroup_root_mtime = + BPF_CORE_READ(root_kernfs, iattr, ia_mtime.tv_nsec); + cgroup_data->cgroup_proc_mtime = + BPF_CORE_READ(proc_kernfs, iattr, ia_mtime.tv_nsec); + } else { + struct kernfs_iattrs___52* root_iattr = + (struct kernfs_iattrs___52*)BPF_CORE_READ(root_kernfs, iattr); + cgroup_data->cgroup_root_mtime = + BPF_CORE_READ(root_iattr, ia_iattr.ia_mtime.tv_nsec); + + struct kernfs_iattrs___52* proc_iattr = + (struct kernfs_iattrs___52*)BPF_CORE_READ(proc_kernfs, iattr); + cgroup_data->cgroup_proc_mtime = + BPF_CORE_READ(proc_iattr, ia_iattr.ia_mtime.tv_nsec); + } + + cgroup_data->cgroup_root_length = 0; + cgroup_data->cgroup_proc_length = 0; + cgroup_data->cgroup_full_length = 0; + + size_t cgroup_root_length = + bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(root_kernfs, name)); + barrier_var(cgroup_root_length); + if (cgroup_root_length <= MAX_PATH) { + barrier_var(cgroup_root_length); + cgroup_data->cgroup_root_length = cgroup_root_length; + payload += cgroup_root_length; + } + + size_t cgroup_proc_length = + bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(proc_kernfs, name)); + barrier_var(cgroup_proc_length); + if (cgroup_proc_length <= MAX_PATH) { + barrier_var(cgroup_proc_length); + cgroup_data->cgroup_proc_length = cgroup_proc_length; + payload += cgroup_proc_length; + } + + if (FETCH_CGROUPS_FROM_BPF) { + cgroup_data->cgroup_full_path_root_pos = -1; + void* payload_end_pos = read_full_cgroup_path(proc_kernfs, root_kernfs, payload, + &cgroup_data->cgroup_full_path_root_pos); + cgroup_data->cgroup_full_length = payload_end_pos - payload; + payload = payload_end_pos; + } + + return (void*)payload; +} + +static INLINE void* populate_var_metadata(struct var_metadata_t* metadata, + struct task_struct* task, + u32 pid, void* payload) +{ + u64 uid_gid = bpf_get_current_uid_gid(); + + metadata->uid = (u32)uid_gid; + metadata->gid = uid_gid >> 32; + metadata->pid = pid; + metadata->exec_id = BPF_CORE_READ(task, self_exec_id); + metadata->start_time = BPF_CORE_READ(task, start_time); + metadata->comm_length = 0; + + size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm); + barrier_var(comm_length); + if (comm_length <= TASK_COMM_LEN) { + barrier_var(comm_length); + metadata->comm_length = comm_length; + payload += comm_length; + } + + return (void*)payload; +} + +static INLINE struct var_kill_data_t* +get_var_kill_data(struct pt_regs* ctx, int spid, int tpid, int sig) +{ + int zero = 0; + struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero); + + if (kill_data == NULL) + return NULL; + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + + void* payload = populate_var_metadata(&kill_data->meta, task, spid, kill_data->payload); + payload = populate_cgroup_info(&kill_data->cgroup_data, task, payload); + size_t payload_length = 
payload - (void*)kill_data->payload; + kill_data->payload_length = payload_length; + populate_ancestors(task, &kill_data->ancestors_info); + kill_data->meta.type = KILL_EVENT; + kill_data->kill_target_pid = tpid; + kill_data->kill_sig = sig; + kill_data->kill_count = 1; + kill_data->last_kill_time = bpf_ktime_get_ns(); + return kill_data; +} + +static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig) +{ + if ((KILL_SIGNALS & (1ULL << sig)) == 0) + return 0; + + u32 spid = get_userspace_pid(); + struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid); + + if (arr_struct == NULL) { + struct var_kill_data_t* kill_data = get_var_kill_data(ctx, spid, tpid, sig); + int zero = 0; + + if (kill_data == NULL) + return 0; + arr_struct = bpf_map_lookup_elem(&data_heap, &zero); + if (arr_struct == NULL) + return 0; + bpf_probe_read(&arr_struct->array[0], sizeof(arr_struct->array[0]), kill_data); + } else { + int index = get_var_spid_index(arr_struct, spid); + + if (index == -1) { + struct var_kill_data_t* kill_data = + get_var_kill_data(ctx, spid, tpid, sig); + if (kill_data == NULL) + return 0; +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) + if (arr_struct->array[i].meta.pid == 0) { + bpf_probe_read(&arr_struct->array[i], + sizeof(arr_struct->array[i]), kill_data); + bpf_map_update_elem(&var_tpid_to_data, &tpid, + arr_struct, 0); + + return 0; + } + return 0; + } + + struct var_kill_data_t* kill_data = &arr_struct->array[index]; + + u64 delta_sec = + (bpf_ktime_get_ns() - kill_data->last_kill_time) / 1000000000; + + if (delta_sec < STALE_INFO) { + kill_data->kill_count++; + kill_data->last_kill_time = bpf_ktime_get_ns(); + bpf_probe_read(&arr_struct->array[index], + sizeof(arr_struct->array[index]), + kill_data); + } else { + struct var_kill_data_t* kill_data = + get_var_kill_data(ctx, spid, tpid, sig); + if (kill_data == NULL) + return 0; + bpf_probe_read(&arr_struct->array[index], + sizeof(arr_struct->array[index]), + kill_data); + } + } + bpf_map_update_elem(&var_tpid_to_data, &tpid, arr_struct, 0); + return 0; +} + +static INLINE void bpf_stats_enter(struct bpf_func_stats_ctx* bpf_stat_ctx, + enum bpf_function_id func_id) +{ + int func_id_key = func_id; + + bpf_stat_ctx->start_time_ns = bpf_ktime_get_ns(); + bpf_stat_ctx->bpf_func_stats_data_val = + bpf_map_lookup_elem(&bpf_func_stats, &func_id_key); + if (bpf_stat_ctx->bpf_func_stats_data_val) + bpf_stat_ctx->bpf_func_stats_data_val->num_executions++; +} + +static INLINE void bpf_stats_exit(struct bpf_func_stats_ctx* bpf_stat_ctx) +{ + if (bpf_stat_ctx->bpf_func_stats_data_val) + bpf_stat_ctx->bpf_func_stats_data_val->time_elapsed_ns += + bpf_ktime_get_ns() - bpf_stat_ctx->start_time_ns; +} + +static INLINE void +bpf_stats_pre_submit_var_perf_event(struct bpf_func_stats_ctx* bpf_stat_ctx, + struct var_metadata_t* meta) +{ + if (bpf_stat_ctx->bpf_func_stats_data_val) { + bpf_stat_ctx->bpf_func_stats_data_val->num_perf_events++; + meta->bpf_stats_num_perf_events = + bpf_stat_ctx->bpf_func_stats_data_val->num_perf_events; + } + meta->bpf_stats_start_ktime_ns = bpf_stat_ctx->start_time_ns; + meta->cpu_id = bpf_get_smp_processor_id(); +} + +static INLINE size_t +read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload) +{ + size_t length = 0; + size_t filepart_length; + struct dentry* parent_dentry; + +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < MAX_PATH_DEPTH; i++) { + filepart_length = bpf_probe_read_str(payload, MAX_PATH, 
+ BPF_CORE_READ(filp_dentry, d_name.name)); + barrier_var(filepart_length); + if (filepart_length > MAX_PATH) + break; + barrier_var(filepart_length); + payload += filepart_length; + length += filepart_length; + + parent_dentry = BPF_CORE_READ(filp_dentry, d_parent); + if (filp_dentry == parent_dentry) + break; + filp_dentry = parent_dentry; + } + + return length; +} + +static INLINE bool +is_ancestor_in_allowed_inodes(struct dentry* filp_dentry) +{ + struct dentry* parent_dentry; +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < MAX_PATH_DEPTH; i++) { + u64 dir_ino = BPF_CORE_READ(filp_dentry, d_inode, i_ino); + bool* allowed_dir = bpf_map_lookup_elem(&allowed_directory_inodes, &dir_ino); + + if (allowed_dir != NULL) + return true; + parent_dentry = BPF_CORE_READ(filp_dentry, d_parent); + if (filp_dentry == parent_dentry) + break; + filp_dentry = parent_dentry; + } + return false; +} + +static INLINE bool is_dentry_allowed_for_filemod(struct dentry* file_dentry, + u32* device_id, + u64* file_ino) +{ + u32 dev_id = BPF_CORE_READ(file_dentry, d_sb, s_dev); + *device_id = dev_id; + bool* allowed_device = bpf_map_lookup_elem(&allowed_devices, &dev_id); + + if (allowed_device == NULL) + return false; + + u64 ino = BPF_CORE_READ(file_dentry, d_inode, i_ino); + *file_ino = ino; + bool* allowed_file = bpf_map_lookup_elem(&allowed_file_inodes, &ino); + + if (allowed_file == NULL) + if (!is_ancestor_in_allowed_inodes(BPF_CORE_READ(file_dentry, d_parent))) + return false; + return true; +} + +SEC("kprobe/proc_sys_write") +ssize_t BPF_KPROBE(kprobe__proc_sys_write, + struct file* filp, const char* buf, + size_t count, loff_t* ppos) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_proc_sys_write); + + u32 pid = get_userspace_pid(); + int zero = 0; + struct var_sysctl_data_t* sysctl_data = + bpf_map_lookup_elem(&data_heap, &zero); + if (!sysctl_data) + goto out; + + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + sysctl_data->meta.type = SYSCTL_EVENT; + void* payload = populate_var_metadata(&sysctl_data->meta, task, pid, sysctl_data->payload); + payload = populate_cgroup_info(&sysctl_data->cgroup_data, task, payload); + + populate_ancestors(task, &sysctl_data->ancestors_info); + + sysctl_data->sysctl_val_length = 0; + sysctl_data->sysctl_path_length = 0; + + size_t sysctl_val_length = bpf_probe_read_str(payload, CTL_MAXNAME, buf); + barrier_var(sysctl_val_length); + if (sysctl_val_length <= CTL_MAXNAME) { + barrier_var(sysctl_val_length); + sysctl_data->sysctl_val_length = sysctl_val_length; + payload += sysctl_val_length; + } + + size_t sysctl_path_length = bpf_probe_read_str(payload, MAX_PATH, + BPF_CORE_READ(filp, f_path.dentry, d_name.name)); + barrier_var(sysctl_path_length); + if (sysctl_path_length <= MAX_PATH) { + barrier_var(sysctl_path_length); + sysctl_data->sysctl_path_length = sysctl_path_length; + payload += sysctl_path_length; + } + + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &sysctl_data->meta); + unsigned long data_len = payload - (void*)sysctl_data; + data_len = data_len > sizeof(struct var_sysctl_data_t) + ? 
sizeof(struct var_sysctl_data_t) + : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, sysctl_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("tracepoint/syscalls/sys_enter_kill") +int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx) +{ + struct bpf_func_stats_ctx stats_ctx; + + bpf_stats_enter(&stats_ctx, profiler_bpf_sys_enter_kill); + int pid = ctx->args[0]; + int sig = ctx->args[1]; + int ret = trace_var_sys_kill(ctx, pid, sig); + bpf_stats_exit(&stats_ctx); + return ret; +}; + +SEC("raw_tracepoint/sched_process_exit") +int raw_tracepoint__sched_process_exit(void* ctx) +{ + int zero = 0; + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exit); + + u32 tpid = get_userspace_pid(); + + struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid); + struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero); + + if (arr_struct == NULL || kill_data == NULL) + goto out; + + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); + +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) { + struct var_kill_data_t* past_kill_data = &arr_struct->array[i]; + + if (past_kill_data != NULL && past_kill_data->kill_target_pid == tpid) { + bpf_probe_read(kill_data, sizeof(*past_kill_data), past_kill_data); + void* payload = kill_data->payload; + size_t offset = kill_data->payload_length; + if (offset >= MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN) + return 0; + payload += offset; + + kill_data->kill_target_name_length = 0; + kill_data->kill_target_cgroup_proc_length = 0; + + size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm); + barrier_var(comm_length); + if (comm_length <= TASK_COMM_LEN) { + barrier_var(comm_length); + kill_data->kill_target_name_length = comm_length; + payload += comm_length; + } + + size_t cgroup_proc_length = bpf_probe_read_str(payload, KILL_TARGET_LEN, + BPF_CORE_READ(proc_kernfs, name)); + barrier_var(cgroup_proc_length); + if (cgroup_proc_length <= KILL_TARGET_LEN) { + barrier_var(cgroup_proc_length); + kill_data->kill_target_cgroup_proc_length = cgroup_proc_length; + payload += cgroup_proc_length; + } + + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &kill_data->meta); + unsigned long data_len = (void*)payload - (void*)kill_data; + data_len = data_len > sizeof(struct var_kill_data_t) + ? 
sizeof(struct var_kill_data_t) + : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, kill_data, data_len); + } + } + bpf_map_delete_elem(&var_tpid_to_data, &tpid); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("raw_tracepoint/sched_process_exec") +int raw_tracepoint__sched_process_exec(struct bpf_raw_tracepoint_args* ctx) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exec); + + struct linux_binprm* bprm = (struct linux_binprm*)ctx->args[2]; + u64 inode = BPF_CORE_READ(bprm, file, f_inode, i_ino); + + bool* should_filter_binprm = bpf_map_lookup_elem(&disallowed_exec_inodes, &inode); + if (should_filter_binprm != NULL) + goto out; + + int zero = 0; + struct var_exec_data_t* proc_exec_data = bpf_map_lookup_elem(&data_heap, &zero); + if (!proc_exec_data) + goto out; + + if (INODE_FILTER && inode != INODE_FILTER) + return 0; + + u32 pid = get_userspace_pid(); + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + + proc_exec_data->meta.type = EXEC_EVENT; + proc_exec_data->bin_path_length = 0; + proc_exec_data->cmdline_length = 0; + proc_exec_data->environment_length = 0; + void* payload = populate_var_metadata(&proc_exec_data->meta, task, pid, + proc_exec_data->payload); + payload = populate_cgroup_info(&proc_exec_data->cgroup_data, task, payload); + + struct task_struct* parent_task = BPF_CORE_READ(task, real_parent); + proc_exec_data->parent_pid = BPF_CORE_READ(parent_task, tgid); + proc_exec_data->parent_uid = BPF_CORE_READ(parent_task, real_cred, uid.val); + proc_exec_data->parent_exec_id = BPF_CORE_READ(parent_task, self_exec_id); + proc_exec_data->parent_start_time = BPF_CORE_READ(parent_task, start_time); + + const char* filename = BPF_CORE_READ(bprm, filename); + size_t bin_path_length = bpf_probe_read_str(payload, MAX_FILENAME_LEN, filename); + barrier_var(bin_path_length); + if (bin_path_length <= MAX_FILENAME_LEN) { + barrier_var(bin_path_length); + proc_exec_data->bin_path_length = bin_path_length; + payload += bin_path_length; + } + + void* arg_start = (void*)BPF_CORE_READ(task, mm, arg_start); + void* arg_end = (void*)BPF_CORE_READ(task, mm, arg_end); + unsigned int cmdline_length = probe_read_lim(payload, arg_start, + arg_end - arg_start, MAX_ARGS_LEN); + + if (cmdline_length <= MAX_ARGS_LEN) { + barrier_var(cmdline_length); + proc_exec_data->cmdline_length = cmdline_length; + payload += cmdline_length; + } + + if (READ_ENVIRON_FROM_EXEC) { + void* env_start = (void*)BPF_CORE_READ(task, mm, env_start); + void* env_end = (void*)BPF_CORE_READ(task, mm, env_end); + unsigned long env_len = probe_read_lim(payload, env_start, + env_end - env_start, MAX_ENVIRON_LEN); + if (cmdline_length <= MAX_ENVIRON_LEN) { + proc_exec_data->environment_length = env_len; + payload += env_len; + } + } + + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &proc_exec_data->meta); + unsigned long data_len = payload - (void*)proc_exec_data; + data_len = data_len > sizeof(struct var_exec_data_t) + ? 
sizeof(struct var_exec_data_t) + : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, proc_exec_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("kretprobe/do_filp_open") +int kprobe_ret__do_filp_open(struct pt_regs* ctx) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_do_filp_open_ret); + + struct file* filp = (struct file*)PT_REGS_RC_CORE(ctx); + + if (filp == NULL || IS_ERR(filp)) + goto out; + unsigned int flags = BPF_CORE_READ(filp, f_flags); + if ((flags & (O_RDWR | O_WRONLY)) == 0) + goto out; + if ((flags & O_TMPFILE) > 0) + goto out; + struct inode* file_inode = BPF_CORE_READ(filp, f_inode); + umode_t mode = BPF_CORE_READ(file_inode, i_mode); + if (S_ISDIR(mode) || S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || + S_ISSOCK(mode)) + goto out; + + struct dentry* filp_dentry = BPF_CORE_READ(filp, f_path.dentry); + u32 device_id = 0; + u64 file_ino = 0; + if (!is_dentry_allowed_for_filemod(filp_dentry, &device_id, &file_ino)) + goto out; + + int zero = 0; + struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero); + if (!filemod_data) + goto out; + + u32 pid = get_userspace_pid(); + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + + filemod_data->meta.type = FILEMOD_EVENT; + filemod_data->fmod_type = FMOD_OPEN; + filemod_data->dst_flags = flags; + filemod_data->src_inode = 0; + filemod_data->dst_inode = file_ino; + filemod_data->src_device_id = 0; + filemod_data->dst_device_id = device_id; + filemod_data->src_filepath_length = 0; + filemod_data->dst_filepath_length = 0; + + void* payload = populate_var_metadata(&filemod_data->meta, task, pid, + filemod_data->payload); + payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); + + size_t len = read_absolute_file_path_from_dentry(filp_dentry, payload); + barrier_var(len); + if (len <= MAX_FILEPATH_LENGTH) { + barrier_var(len); + payload += len; + filemod_data->dst_filepath_length = len; + } + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta); + unsigned long data_len = payload - (void*)filemod_data; + data_len = data_len > sizeof(*filemod_data) ? 
sizeof(*filemod_data) : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("kprobe/vfs_link") +int BPF_KPROBE(kprobe__vfs_link, + struct dentry* old_dentry, struct inode* dir, + struct dentry* new_dentry, struct inode** delegated_inode) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_link); + + u32 src_device_id = 0; + u64 src_file_ino = 0; + u32 dst_device_id = 0; + u64 dst_file_ino = 0; + if (!is_dentry_allowed_for_filemod(old_dentry, &src_device_id, &src_file_ino) && + !is_dentry_allowed_for_filemod(new_dentry, &dst_device_id, &dst_file_ino)) + goto out; + + int zero = 0; + struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero); + if (!filemod_data) + goto out; + + u32 pid = get_userspace_pid(); + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + + filemod_data->meta.type = FILEMOD_EVENT; + filemod_data->fmod_type = FMOD_LINK; + filemod_data->dst_flags = 0; + filemod_data->src_inode = src_file_ino; + filemod_data->dst_inode = dst_file_ino; + filemod_data->src_device_id = src_device_id; + filemod_data->dst_device_id = dst_device_id; + filemod_data->src_filepath_length = 0; + filemod_data->dst_filepath_length = 0; + + void* payload = populate_var_metadata(&filemod_data->meta, task, pid, + filemod_data->payload); + payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); + + size_t len = read_absolute_file_path_from_dentry(old_dentry, payload); + barrier_var(len); + if (len <= MAX_FILEPATH_LENGTH) { + barrier_var(len); + payload += len; + filemod_data->src_filepath_length = len; + } + + len = read_absolute_file_path_from_dentry(new_dentry, payload); + barrier_var(len); + if (len <= MAX_FILEPATH_LENGTH) { + barrier_var(len); + payload += len; + filemod_data->dst_filepath_length = len; + } + + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta); + unsigned long data_len = payload - (void*)filemod_data; + data_len = data_len > sizeof(*filemod_data) ? 
sizeof(*filemod_data) : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("kprobe/vfs_symlink") +int BPF_KPROBE(kprobe__vfs_symlink, struct inode* dir, struct dentry* dentry, + const char* oldname) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_symlink); + + u32 dst_device_id = 0; + u64 dst_file_ino = 0; + if (!is_dentry_allowed_for_filemod(dentry, &dst_device_id, &dst_file_ino)) + goto out; + + int zero = 0; + struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero); + if (!filemod_data) + goto out; + + u32 pid = get_userspace_pid(); + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + + filemod_data->meta.type = FILEMOD_EVENT; + filemod_data->fmod_type = FMOD_SYMLINK; + filemod_data->dst_flags = 0; + filemod_data->src_inode = 0; + filemod_data->dst_inode = dst_file_ino; + filemod_data->src_device_id = 0; + filemod_data->dst_device_id = dst_device_id; + filemod_data->src_filepath_length = 0; + filemod_data->dst_filepath_length = 0; + + void* payload = populate_var_metadata(&filemod_data->meta, task, pid, + filemod_data->payload); + payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); + + size_t len = bpf_probe_read_str(payload, MAX_FILEPATH_LENGTH, oldname); + barrier_var(len); + if (len <= MAX_FILEPATH_LENGTH) { + barrier_var(len); + payload += len; + filemod_data->src_filepath_length = len; + } + len = read_absolute_file_path_from_dentry(dentry, payload); + barrier_var(len); + if (len <= MAX_FILEPATH_LENGTH) { + barrier_var(len); + payload += len; + filemod_data->dst_filepath_length = len; + } + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta); + unsigned long data_len = payload - (void*)filemod_data; + data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("raw_tracepoint/sched_process_fork") +int raw_tracepoint__sched_process_fork(struct bpf_raw_tracepoint_args* ctx) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_fork); + + int zero = 0; + struct var_fork_data_t* fork_data = bpf_map_lookup_elem(&data_heap, &zero); + if (!fork_data) + goto out; + + struct task_struct* parent = (struct task_struct*)ctx->args[0]; + struct task_struct* child = (struct task_struct*)ctx->args[1]; + fork_data->meta.type = FORK_EVENT; + + void* payload = populate_var_metadata(&fork_data->meta, child, + BPF_CORE_READ(child, pid), fork_data->payload); + fork_data->parent_pid = BPF_CORE_READ(parent, pid); + fork_data->parent_exec_id = BPF_CORE_READ(parent, self_exec_id); + fork_data->parent_start_time = BPF_CORE_READ(parent, start_time); + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &fork_data->meta); + + unsigned long data_len = payload - (void*)fork_data; + data_len = data_len > sizeof(*fork_data) ? 
sizeof(*fork_data) : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, fork_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/profiler1.c b/tools/testing/selftests/bpf/progs/profiler1.c new file mode 100644 index 000000000000..4df9088bfc00 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/profiler1.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var)) +#define UNROLL +#define INLINE __always_inline +#include "profiler.inc.h" diff --git a/tools/testing/selftests/bpf/progs/profiler2.c b/tools/testing/selftests/bpf/progs/profiler2.c new file mode 100644 index 000000000000..0f32a3cbf556 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/profiler2.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define barrier_var(var) /**/ +/* undef #define UNROLL */ +#define INLINE /**/ +#include "profiler.inc.h" diff --git a/tools/testing/selftests/bpf/progs/profiler3.c b/tools/testing/selftests/bpf/progs/profiler3.c new file mode 100644 index 000000000000..6249fc31ccb0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/profiler3.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define barrier_var(var) /**/ +#define UNROLL +#define INLINE __noinline +#include "profiler.inc.h" From 54fada41e8a162a45ab13ac4f76af609733d4679 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 8 Oct 2020 18:12:40 -0700 Subject: [PATCH 41/62] selftests/bpf: Asm tests for the verifier regalloc tracking. Add asm tests for register allocator tracking logic. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201009011240.48506-5-alexei.starovoitov@gmail.com --- .../testing/selftests/bpf/verifier/regalloc.c | 243 ++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 tools/testing/selftests/bpf/verifier/regalloc.c diff --git a/tools/testing/selftests/bpf/verifier/regalloc.c b/tools/testing/selftests/bpf/verifier/regalloc.c new file mode 100644 index 000000000000..ac71b824f97a --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/regalloc.c @@ -0,0 +1,243 @@ +{ + "regalloc basic", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 4), + BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "regalloc negative", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), + 
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 24, 4), + BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 4 }, + .result = REJECT, + .errstr = "invalid access to map value, value_size=48 off=48 size=1", + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "regalloc src_reg mark", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 5), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "regalloc src_reg negative", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 22, 5), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 4 }, + .result = REJECT, + .errstr = "invalid access to map value, value_size=48 off=44 size=8", + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "regalloc and spill", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 7), + /* r0 has upper bound that should propagate into r2 */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */ + BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2), + /* r3 has lower and upper bounds */ + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "regalloc and spill negative", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, 
BPF_REG_0, 0, 11), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 48, 7), + /* r0 has upper bound that should propagate into r2 */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */ + BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2), + /* r3 has lower and upper bounds */ + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 4 }, + .result = REJECT, + .errstr = "invalid access to map value, value_size=48 off=48 size=8", + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "regalloc three regs", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 12, 5), + BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_4), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "regalloc after call", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_8, 20, 4), + BPF_JMP_IMM(BPF_JSLT, BPF_REG_9, 0, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_8), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_9), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "regalloc in callee", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_7), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 20, 5), + BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 4 }, + .result 
= ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, From dd2ce6a5373c6f5c830be54be10775458a8bd312 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:01 +0200 Subject: [PATCH 42/62] bpf: Improve bpf_redirect_neigh helper description Follow-up to address David's feedback that we should better describe internals of the bpf_redirect_neigh() helper. Suggested-by: David Ahern Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: David Ahern Link: https://lore.kernel.org/bpf/20201010234006.7075-2-daniel@iogearbox.net --- include/uapi/linux/bpf.h | 10 +++++++--- tools/include/uapi/linux/bpf.h | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42d2df799397..4272cc53d478 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3679,10 +3679,14 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 42d2df799397..4272cc53d478 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3679,10 +3679,14 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. From 9aa1206e8f48222f35a0c809f33b2f4aaa1e2661 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:02 +0200 Subject: [PATCH 43/62] bpf: Add redirect_peer helper Add an efficient ingress to ingress netns switch that can be used out of tc BPF programs in order to redirect traffic from host ns ingress into a container veth device ingress without having to go via CPU backlog queue [0]. For local containers this can also be utilized and path via CPU backlog queue only needs to be taken once, not twice. 
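For illustration only (not part of this patch), a minimal tc ingress program
using the new helper could look like the sketch below; the ifindex value and
section/program names are placeholders, and it assumes libbpf's generated
helper definitions already carry bpf_redirect_peer():

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* Hypothetical ifindex of the host-side veth whose peer sits in the
   * container netns; a real deployment would configure this via a map
   * or at load time instead of hard-coding it.
   */
  #define HOST_VETH_IFINDEX 42

  SEC("classifier")
  int tc_ingress_redirect(struct __sk_buff *skb)
  {
  	/* Hop from host ingress straight to the peer device's ingress in
  	 * the other netns, bypassing the CPU backlog queue. flags must be
  	 * 0; the helper returns TC_ACT_REDIRECT on success or TC_ACT_SHOT
  	 * on error.
  	 */
  	return bpf_redirect_peer(HOST_VETH_IFINDEX, 0);
  }

  char _license[] SEC("license") = "GPL";

Attached at the tc ingress hook of the host-facing veth, this replaces the
bpf_redirect() plus backlog round trip with a direct netns switch.
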
On a high level this borrows from ipvlan which does similar switch in __netif_receive_skb_core() and then iterates via another_round. This helps to reduce latency for mentioned use cases. Pod to remote pod with redirect(), TCP_RR [1]: # percpu_netperf 10.217.1.33 RT_LATENCY: 122.450 (per CPU: 122.666 122.401 122.333 122.401 ) MEAN_LATENCY: 121.210 (per CPU: 121.100 121.260 121.320 121.160 ) STDDEV_LATENCY: 120.040 (per CPU: 119.420 119.910 125.460 115.370 ) MIN_LATENCY: 46.500 (per CPU: 47.000 47.000 47.000 45.000 ) P50_LATENCY: 118.500 (per CPU: 118.000 119.000 118.000 119.000 ) P90_LATENCY: 127.500 (per CPU: 127.000 128.000 127.000 128.000 ) P99_LATENCY: 130.750 (per CPU: 131.000 131.000 129.000 132.000 ) TRANSACTION_RATE: 32666.400 (per CPU: 8152.200 8169.842 8174.439 8169.897 ) Pod to remote pod with redirect_peer(), TCP_RR: # percpu_netperf 10.217.1.33 RT_LATENCY: 44.449 (per CPU: 43.767 43.127 45.279 45.622 ) MEAN_LATENCY: 45.065 (per CPU: 44.030 45.530 45.190 45.510 ) STDDEV_LATENCY: 84.823 (per CPU: 66.770 97.290 84.380 90.850 ) MIN_LATENCY: 33.500 (per CPU: 33.000 33.000 34.000 34.000 ) P50_LATENCY: 43.250 (per CPU: 43.000 43.000 43.000 44.000 ) P90_LATENCY: 46.750 (per CPU: 46.000 47.000 47.000 47.000 ) P99_LATENCY: 52.750 (per CPU: 51.000 54.000 53.000 53.000 ) TRANSACTION_RATE: 90039.500 (per CPU: 22848.186 23187.089 22085.077 21919.130 ) [0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf [1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net --- drivers/net/veth.c | 9 ++++++ include/linux/netdevice.h | 4 +++ include/uapi/linux/bpf.h | 17 +++++++++++ net/core/dev.c | 15 ++++++++-- net/core/filter.c | 54 +++++++++++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 17 +++++++++++ 6 files changed, 106 insertions(+), 10 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 091e5b4ba042..8c737668008a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev) return smp_processor_id() % dev->real_num_rx_queues; } +static struct net_device *veth_peer_dev(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + /* Callers must be under RCU read side. */ + return rcu_dereference(priv->peer); +} + static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags, bool ndo_xmit) @@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, .ndo_xdp_xmit = veth_ndo_xdp_xmit, + .ndo_get_peer_dev = veth_peer_dev, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28cfa53daf72..0533f86018dd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1277,6 +1277,9 @@ struct netdev_net_notifier { * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. + * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); + * If a device is paired with a peer device, return the peer instance. + * The caller must be under RCU read context. 
*/ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1484,6 +1487,7 @@ struct net_device_ops { struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); + struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); }; /** diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4272cc53d478..b97bc5abb3b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/dev.c b/net/core/dev.c index 9d55bf5d1a65..7dd015823593 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); static inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) + struct net_device *orig_dev, bool *another) { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); @@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * redirecting to another netdev */ __skb_push(skb, skb->mac_len); - skb_do_redirect(skb); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } return NULL; case TC_ACT_CONSUMED: return NULL; @@ -5163,7 +5167,12 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { - skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); + bool another = false; + + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; if (!skb) goto out; diff --git a/net/core/filter.c b/net/core/filter.c index 5da44b11e1ec..fab951c6be57 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2380,8 +2380,9 @@ out: /* Internal, non-exposed redirect flags. 
*/ enum { - BPF_F_NEIGH = (1ULL << 1), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH) + BPF_F_NEIGH = (1ULL << 1), + BPF_F_PEER = (1ULL << 2), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2430,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); + dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; - if (unlikely(!dev)) { - kfree_skb(skb); - return -EINVAL; - } + ri->flags = 0; + if (unlikely(!dev)) + goto out_drop; + if (flags & BPF_F_PEER) { + const struct net_device_ops *ops = dev->netdev_ops; + if (unlikely(!ops->ndo_get_peer_dev || + !skb_at_tc_ingress(skb))) + goto out_drop; + dev = ops->ndo_get_peer_dev(dev); + if (unlikely(!dev || + !is_skb_forwardable(dev, skb) || + net_eq(net, dev_net(dev)))) + goto out_drop; + skb->dev = dev; + return -EAGAIN; + } return flags & BPF_F_NEIGH ? __bpf_redirect_neigh(skb, dev) : __bpf_redirect(skb, dev, flags); +out_drop: + kfree_skb(skb); + return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) @@ -2466,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_PEER; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_peer_proto = { + .func = bpf_redirect_peer, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -7053,6 +7091,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; + case BPF_FUNC_redirect_peer: + return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4272cc53d478..b97bc5abb3b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From 4a8f87e60f6db40e640f1db555d063b2c4dea5f1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:03 +0200 Subject: [PATCH 44/62] bpf: Allow for map-in-map with dynamic inner array map entries Recent work in f4d05259213f ("bpf: Add map_meta_equal map ops") and 134fede4eecf ("bpf: Relax max_entries check for most of the inner map types") added support for dynamic inner max elements for most map-in-map types. Exceptions were maps like array or prog array where the map_gen_lookup() callback uses the maps' max_entries field as a constant when emitting instructions. We recently implemented Maglev consistent hashing into Cilium's load balancer which uses map-in-map with an outer map being hash and inner being array holding the Maglev backend table for each service. This has been designed this way in order to reduce overall memory consumption given the outer hash map allows to avoid preallocating a large, flat memory area for all services. Also, the number of service mappings is not always known a-priori. The use case for dynamic inner array map entries is to further reduce memory overhead, for example, some services might just have a small number of back ends while others could have a large number. Right now the Maglev backend table for small and large number of backends would need to have the same inner array map entries which adds a lot of unneeded overhead. Dynamic inner array map entries can be realized by avoiding the inlined code generation for their lookup. The lookup will still be efficient since it will be calling into array_map_lookup_elem() directly and thus avoiding retpoline. The patch adds a BPF_F_INNER_MAP flag to map creation which therefore skips inline code generation and relaxes array_map_meta_equal() check to ignore both maps' max_entries. This also still allows to have faster lookups for map-in-map when BPF_F_INNER_MAP is not specified and hence dynamic max_entries not needed. Example code generation where inner map is dynamic sized array: # bpftool p d x i 125 int handle__sys_enter(void * ctx): ; int handle__sys_enter(void *ctx) 0: (b4) w1 = 0 ; int key = 0; 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 ; 3: (07) r2 += -4 ; inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); 4: (18) r1 = map[id:468] 6: (07) r1 += 272 7: (61) r0 = *(u32 *)(r2 +0) 8: (35) if r0 >= 0x3 goto pc+5 9: (67) r0 <<= 3 10: (0f) r0 += r1 11: (79) r0 = *(u64 *)(r0 +0) 12: (15) if r0 == 0x0 goto pc+1 13: (05) goto pc+1 14: (b7) r0 = 0 15: (b4) w6 = -1 ; if (!inner_map) 16: (15) if r0 == 0x0 goto pc+6 17: (bf) r2 = r10 ; 18: (07) r2 += -4 ; val = bpf_map_lookup_elem(inner_map, &key); 19: (bf) r1 = r0 | No inlining but instead 20: (85) call array_map_lookup_elem#149280 | call to array_map_lookup_elem() ; return val ? *val : -1; | for inner array lookup. 21: (15) if r0 == 0x0 goto pc+1 ; return val ? 
*val : -1; 22: (61) r6 = *(u32 *)(r0 +0) ; } 23: (bc) w0 = w6 24: (95) exit Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010234006.7075-4-daniel@iogearbox.net --- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 17 +++++++++++------ kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/verifier.c | 6 ++++-- net/xdp/xskmap.c | 2 +- tools/include/uapi/linux/bpf.h | 3 +++ 7 files changed, 26 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc63eeed4fd9..2b16bf48aab6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -82,7 +82,7 @@ struct bpf_map_ops { void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); - u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); + int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b97bc5abb3b8..bf5a99d803e4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -435,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bd777dd6f967..c6c81eceb68f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -16,7 +16,7 @@ #define ARRAY_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ - BPF_F_PRESERVE_ELEMS) + BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -62,7 +62,7 @@ int array_map_alloc_check(union bpf_attr *attr) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && - attr->map_flags & BPF_F_MMAPABLE) + attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && @@ -214,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ -static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; @@ -223,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { @@ -496,8 +499,10 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) static bool array_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { - return meta0->max_entries == meta1->max_entries && - bpf_map_meta_equal(meta0, meta1); + if (!bpf_map_meta_equal(meta0, meta1)) + return false; + return meta0->map_flags & BPF_F_INNER_MAP ? 
true : + meta0->max_entries == meta1->max_entries; } struct bpf_iter_seq_array_map_info { @@ -1251,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 array_of_map_gen_lookup(struct bpf_map *map, +static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3395cf140d22..1815e97d4c9c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) * bpf_prog * __htab_map_lookup_elem */ -static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) return __htab_lru_map_lookup_elem(map, key, false); } -static u32 htab_lru_map_gen_lookup(struct bpf_map *map, +static int htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; @@ -2070,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 htab_of_map_gen_lookup(struct bpf_map *map, +static int htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3e36eade3d4..fa5badc9279a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11049,7 +11049,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { cnt = ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == -EOPNOTSUPP) + goto patch_map_ops_generic; + if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -11079,7 +11081,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); - +patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 0c5df593bc56..49da2b8ace8b 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -132,7 +132,7 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; struct bpf_insn *insn = insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b97bc5abb3b8..bf5a99d803e4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -435,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. 
*/ From 6775dab73bdc17df0f19d9f9d7470dc54536a909 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:04 +0200 Subject: [PATCH 45/62] bpf, selftests: Add test for different array inner map size Extend the "diff_size" subtest to also include a non-inlined array map variant where dynamic inner #elems are possible. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201010234006.7075-5-daniel@iogearbox.net --- .../selftests/bpf/prog_tests/btf_map_in_map.c | 39 ++++++++++++----- .../selftests/bpf/progs/test_btf_map_in_map.c | 43 +++++++++++++++++++ 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c index 540fea4c91a5..76ebe4c250f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -55,10 +55,10 @@ static int kern_sync_rcu(void) static void test_lookup_update(void) { - int err, key = 0, val, i; + int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id; + int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd; struct test_btf_map_in_map *skel; - int outer_arr_fd, outer_hash_fd; - int fd, map1_fd, map2_fd, map1_id, map2_id; + int err, key = 0, val, i, fd; skel = test_btf_map_in_map__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) @@ -70,32 +70,45 @@ static void test_lookup_update(void) map1_fd = bpf_map__fd(skel->maps.inner_map1); map2_fd = bpf_map__fd(skel->maps.inner_map2); + map3_fd = bpf_map__fd(skel->maps.inner_map3); + map4_fd = bpf_map__fd(skel->maps.inner_map4); + map5_fd = bpf_map__fd(skel->maps.inner_map5); + outer_arr_dyn_fd = bpf_map__fd(skel->maps.outer_arr_dyn); outer_arr_fd = bpf_map__fd(skel->maps.outer_arr); outer_hash_fd = bpf_map__fd(skel->maps.outer_hash); - /* inner1 = input, inner2 = input + 1 */ - map1_fd = bpf_map__fd(skel->maps.inner_map1); + /* inner1 = input, inner2 = input + 1, inner3 = input + 2 */ bpf_map_update_elem(outer_arr_fd, &key, &map1_fd, 0); - map2_fd = bpf_map__fd(skel->maps.inner_map2); bpf_map_update_elem(outer_hash_fd, &key, &map2_fd, 0); + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map3_fd, 0); skel->bss->input = 1; usleep(1); - bpf_map_lookup_elem(map1_fd, &key, &val); CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1); bpf_map_lookup_elem(map2_fd, &key, &val); CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2); + bpf_map_lookup_elem(map3_fd, &key, &val); + CHECK(val != 3, "inner3", "got %d != exp %d\n", val, 3); - /* inner1 = input + 1, inner2 = input */ + /* inner2 = input, inner1 = input + 1, inner4 = input + 2 */ bpf_map_update_elem(outer_arr_fd, &key, &map2_fd, 0); bpf_map_update_elem(outer_hash_fd, &key, &map1_fd, 0); + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map4_fd, 0); skel->bss->input = 3; usleep(1); - bpf_map_lookup_elem(map1_fd, &key, &val); CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4); bpf_map_lookup_elem(map2_fd, &key, &val); CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3); + bpf_map_lookup_elem(map4_fd, &key, &val); + CHECK(val != 5, "inner4", "got %d != exp %d\n", val, 5); + + /* inner5 = input + 2 */ + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map5_fd, 0); + skel->bss->input = 5; + usleep(1); + bpf_map_lookup_elem(map5_fd, &key, &val); + CHECK(val != 7, "inner5", "got %d != exp %d\n", val, 7); for (i = 0; i < 5; i++) { val = i % 2 ? 
map1_fd : map2_fd; @@ -106,7 +119,13 @@ static void test_lookup_update(void) } err = bpf_map_update_elem(outer_arr_fd, &key, &val, 0); if (CHECK_FAIL(err)) { - printf("failed to update hash_of_maps on iter #%d\n", i); + printf("failed to update array_of_maps on iter #%d\n", i); + goto cleanup; + } + val = i % 2 ? map4_fd : map5_fd; + err = bpf_map_update_elem(outer_arr_dyn_fd, &key, &val, 0); + if (CHECK_FAIL(err)) { + printf("failed to update array_of_maps (dyn) on iter #%d\n", i); goto cleanup; } } diff --git a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c index 193fe0198b21..c1e0c8c7c55f 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c +++ b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c @@ -41,6 +41,43 @@ struct outer_arr { .values = { (void *)&inner_map1, 0, (void *)&inner_map2 }, }; +struct inner_map_sz3 { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 3); + __type(key, int); + __type(value, int); +} inner_map3 SEC(".maps"), + inner_map4 SEC(".maps"); + +struct inner_map_sz4 { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 5); + __type(key, int); + __type(value, int); +} inner_map5 SEC(".maps"); + +struct outer_arr_dyn { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 3); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __array(values, struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); + }); +} outer_arr_dyn SEC(".maps") = { + .values = { + [0] = (void *)&inner_map3, + [1] = (void *)&inner_map4, + [2] = (void *)&inner_map5, + }, +}; + struct outer_hash { __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); __uint(max_entries, 5); @@ -101,6 +138,12 @@ int handle__sys_enter(void *ctx) val = input + 1; bpf_map_update_elem(inner_map, &key, &val, 0); + inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); + if (!inner_map) + return 1; + val = input + 2; + bpf_map_update_elem(inner_map, &key, &val, 0); + return 0; } From 57a73fe7c1988ae726de23ee7c3c49ca82221751 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:05 +0200 Subject: [PATCH 46/62] bpf, selftests: Make redirect_neigh test more extensible Rename into test_tc_redirect.sh and move setup and test code into separate functions so they can be reused for newly added tests in here. Also remove the crude hack to override ifindex inside the object file via xxd and sed and just use a simple map instead. Map given iproute2 does not support BTF fully and therefore neither global data at this point. 
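For illustration, a minimal sketch of the map-based ifindex passing this switches to (it mirrors the code added to test_tc_neigh.c further below; the include lines are the usual selftest ones and are shown here only for completeness):

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  enum {
          dev_src,
          dev_dst,
  };

  /* Plain array map holding the two fwd-side ifindexes; the test script
   * fills slot 0 (dev_src) and slot 1 (dev_dst) via bpftool map update
   * after the programs have been attached.
   */
  struct bpf_map_def SEC("maps") ifindex_map = {
          .type        = BPF_MAP_TYPE_ARRAY,
          .key_size    = sizeof(int),
          .value_size  = sizeof(int),
          .max_entries = 2,
  };

  static __always_inline int get_dev_ifindex(int which)
  {
          int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);

          return ifindex ? *ifindex : 0;
  }

The shell script then writes the real ifindex values into the two slots with bpftool, instead of byte-patching constants into the object file via xxd and sed.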
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201010234006.7075-6-daniel@iogearbox.net --- .../selftests/bpf/progs/test_tc_neigh.c | 40 ++-- tools/testing/selftests/bpf/test_tc_neigh.sh | 168 --------------- .../testing/selftests/bpf/test_tc_redirect.sh | 197 ++++++++++++++++++ 3 files changed, 219 insertions(+), 186 deletions(-) delete mode 100755 tools/testing/selftests/bpf/test_tc_neigh.sh create mode 100755 tools/testing/selftests/bpf/test_tc_redirect.sh diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh.c b/tools/testing/selftests/bpf/progs/test_tc_neigh.c index 889a72c3024f..fe182616b112 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_neigh.c +++ b/tools/testing/selftests/bpf/progs/test_tc_neigh.c @@ -13,17 +13,10 @@ #include #include -#ifndef barrier_data -# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory") -#endif - #ifndef ctx_ptr # define ctx_ptr(field) (void *)(long)(field) #endif -#define dst_to_src_tmp 0xeeddddeeU -#define src_to_dst_tmp 0xeeffffeeU - #define ip4_src 0xac100164 /* 172.16.1.100 */ #define ip4_dst 0xac100264 /* 172.16.2.100 */ @@ -39,6 +32,18 @@ a.s6_addr32[3] == b.s6_addr32[3]) #endif +enum { + dev_src, + dev_dst, +}; + +struct bpf_map_def SEC("maps") ifindex_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 2, +}; + static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb, __be32 addr) { @@ -73,7 +78,14 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb, return v6_equal(ip6h->daddr, addr); } -SEC("chk_neigh") int tc_chk(struct __sk_buff *skb) +static __always_inline int get_dev_ifindex(int which) +{ + int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); + + return ifindex ? *ifindex : 0; +} + +SEC("chk_egress") int tc_chk(struct __sk_buff *skb) { void *data_end = ctx_ptr(skb->data_end); void *data = ctx_ptr(skb->data); @@ -87,7 +99,6 @@ SEC("chk_neigh") int tc_chk(struct __sk_buff *skb) SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) { - int idx = dst_to_src_tmp; __u8 zero[ETH_ALEN * 2]; bool redirect = false; @@ -103,19 +114,15 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) if (!redirect) return TC_ACT_OK; - barrier_data(&idx); - idx = bpf_ntohl(idx); - __builtin_memset(&zero, 0, sizeof(zero)); if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(idx, 0); + return bpf_redirect_neigh(get_dev_ifindex(dev_src), 0); } SEC("src_ingress") int tc_src(struct __sk_buff *skb) { - int idx = src_to_dst_tmp; __u8 zero[ETH_ALEN * 2]; bool redirect = false; @@ -131,14 +138,11 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb) if (!redirect) return TC_ACT_OK; - barrier_data(&idx); - idx = bpf_ntohl(idx); - __builtin_memset(&zero, 0, sizeof(zero)); if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(idx, 0); + return bpf_redirect_neigh(get_dev_ifindex(dev_dst), 0); } char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_neigh.sh b/tools/testing/selftests/bpf/test_tc_neigh.sh deleted file mode 100755 index 31d8c3df8b24..000000000000 --- a/tools/testing/selftests/bpf/test_tc_neigh.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link -# between src and dst. 
The netns fwd has veth links to each src and dst. The -# client is in src and server in dst. The test installs a TC BPF program to each -# host facing veth in fwd which calls into bpf_redirect_peer() to perform the -# neigh addr population and redirect; it also installs a dropper prog on the -# egress side to drop skbs if neigh addrs were not populated. - -if [[ $EUID -ne 0 ]]; then - echo "This script must be run as root" - echo "FAIL" - exit 1 -fi - -# check that nc, dd, ping, ping6 and timeout are present -command -v nc >/dev/null 2>&1 || \ - { echo >&2 "nc is not available"; exit 1; } -command -v dd >/dev/null 2>&1 || \ - { echo >&2 "dd is not available"; exit 1; } -command -v timeout >/dev/null 2>&1 || \ - { echo >&2 "timeout is not available"; exit 1; } -command -v ping >/dev/null 2>&1 || \ - { echo >&2 "ping is not available"; exit 1; } -command -v ping6 >/dev/null 2>&1 || \ - { echo >&2 "ping6 is not available"; exit 1; } - -readonly GREEN='\033[0;92m' -readonly RED='\033[0;31m' -readonly NC='\033[0m' # No Color - -readonly PING_ARG="-c 3 -w 10 -q" - -readonly TIMEOUT=10 - -readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" -readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" -readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" - -readonly IP4_SRC="172.16.1.100" -readonly IP4_DST="172.16.2.100" - -readonly IP6_SRC="::1:dead:beef:cafe" -readonly IP6_DST="::2:dead:beef:cafe" - -readonly IP4_SLL="169.254.0.1" -readonly IP4_DLL="169.254.0.2" -readonly IP4_NET="169.254.0.0" - -cleanup() -{ - ip netns del ${NS_SRC} - ip netns del ${NS_FWD} - ip netns del ${NS_DST} -} - -trap cleanup EXIT - -set -e - -ip netns add "${NS_SRC}" -ip netns add "${NS_FWD}" -ip netns add "${NS_DST}" - -ip link add veth_src type veth peer name veth_src_fwd -ip link add veth_dst type veth peer name veth_dst_fwd - -ip link set veth_src netns ${NS_SRC} -ip link set veth_src_fwd netns ${NS_FWD} - -ip link set veth_dst netns ${NS_DST} -ip link set veth_dst_fwd netns ${NS_FWD} - -ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src -ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst - -# The fwd netns automatically get a v6 LL address / routes, but also needs v4 -# one in order to start ARP probing. IP4_NET route is added to the endpoints -# so that the ARP processing will reply. 
- -ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd -ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd - -ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad -ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad - -ip -netns ${NS_SRC} link set dev veth_src up -ip -netns ${NS_FWD} link set dev veth_src_fwd up - -ip -netns ${NS_DST} link set dev veth_dst up -ip -netns ${NS_FWD} link set dev veth_dst_fwd up - -ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global -ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global -ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global - -ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global -ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global - -ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global -ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global -ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global - -ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global -ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global - -fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) -fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) - -ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src -ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst - -ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src -ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst - -veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex | awk '{printf "%08x\n", $1}') -veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex | awk '{printf "%08x\n", $1}') - -xxd -p < test_tc_neigh.o | sed "s/eeddddee/$veth_src/g" | xxd -r -p > test_tc_neigh.x.o -xxd -p < test_tc_neigh.x.o | sed "s/eeffffee/$veth_dst/g" | xxd -r -p > test_tc_neigh.y.o - -ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact -ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.y.o sec src_ingress -ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh - -ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact -ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.y.o sec dst_ingress -ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh - -rm -f test_tc_neigh.x.o test_tc_neigh.y.o - -ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" -ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" - -set +e - -TEST="TCPv4 connectivity test" -ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="TCPv6 connectivity test" -ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="ICMPv4 connectivity test" -ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} -if [ $? 
-ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="ICMPv6 connectivity test" -ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh new file mode 100755 index 000000000000..6ad441405132 --- /dev/null +++ b/tools/testing/selftests/bpf/test_tc_redirect.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link +# between src and dst. The netns fwd has veth links to each src and dst. The +# client is in src and server in dst. The test installs a TC BPF program to each +# host facing veth in fwd which calls into bpf_redirect_peer() to perform the +# neigh addr population and redirect; it also installs a dropper prog on the +# egress side to drop skbs if neigh addrs were not populated. + +if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root" + echo "FAIL" + exit 1 +fi + +# check that needed tools are present +command -v nc >/dev/null 2>&1 || \ + { echo >&2 "nc is not available"; exit 1; } +command -v dd >/dev/null 2>&1 || \ + { echo >&2 "dd is not available"; exit 1; } +command -v timeout >/dev/null 2>&1 || \ + { echo >&2 "timeout is not available"; exit 1; } +command -v ping >/dev/null 2>&1 || \ + { echo >&2 "ping is not available"; exit 1; } +command -v ping6 >/dev/null 2>&1 || \ + { echo >&2 "ping6 is not available"; exit 1; } +command -v perl >/dev/null 2>&1 || \ + { echo >&2 "perl is not available"; exit 1; } +command -v jq >/dev/null 2>&1 || \ + { echo >&2 "jq is not available"; exit 1; } +command -v bpftool >/dev/null 2>&1 || \ + { echo >&2 "bpftool is not available"; exit 1; } + +readonly GREEN='\033[0;92m' +readonly RED='\033[0;31m' +readonly NC='\033[0m' # No Color + +readonly PING_ARG="-c 3 -w 10 -q" + +readonly TIMEOUT=10 + +readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" +readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" +readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" + +readonly IP4_SRC="172.16.1.100" +readonly IP4_DST="172.16.2.100" + +readonly IP6_SRC="::1:dead:beef:cafe" +readonly IP6_DST="::2:dead:beef:cafe" + +readonly IP4_SLL="169.254.0.1" +readonly IP4_DLL="169.254.0.2" +readonly IP4_NET="169.254.0.0" + +netns_cleanup() +{ + ip netns del ${NS_SRC} + ip netns del ${NS_FWD} + ip netns del ${NS_DST} +} + +netns_setup() +{ + ip netns add "${NS_SRC}" + ip netns add "${NS_FWD}" + ip netns add "${NS_DST}" + + ip link add veth_src type veth peer name veth_src_fwd + ip link add veth_dst type veth peer name veth_dst_fwd + + ip link set veth_src netns ${NS_SRC} + ip link set veth_src_fwd netns ${NS_FWD} + + ip link set veth_dst netns ${NS_DST} + ip link set veth_dst_fwd netns ${NS_FWD} + + ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src + ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst + + # The fwd netns automatically get a v6 LL address / routes, but also + # needs v4 one in order to start ARP probing. IP4_NET route is added + # to the endpoints so that the ARP processing will reply. 
+ + ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd + ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd + + ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad + ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad + + ip -netns ${NS_SRC} link set dev veth_src up + ip -netns ${NS_FWD} link set dev veth_src_fwd up + + ip -netns ${NS_DST} link set dev veth_dst up + ip -netns ${NS_FWD} link set dev veth_dst_fwd up + + ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global + ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global + ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global + + ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global + ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global + + ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global + ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global + ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global + + ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global + ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global + + fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) + fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) + + ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src + ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst + + ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src + ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst +} + +netns_test_connectivity() +{ + set +e + + ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" + ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" + + TEST="TCPv4 connectivity test" + ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" + if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + TEST="TCPv6 connectivity test" + ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" + if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + TEST="ICMPv4 connectivity test" + ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} + if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + TEST="ICMPv6 connectivity test" + ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} + if [ $? 
-ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + set -e +} + +hex_mem_str() +{ + perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1 +} + +netns_setup_neigh() +{ + ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.o sec src_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.o sec chk_egress + + ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.o sec dst_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.o sec chk_egress + + veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex) + veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex) + + progs=$(ip netns exec ${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]') + for prog in $progs; do + map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]') + if [ ! -z "$map" ]; then + bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src) + bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst) + fi + done +} + +trap netns_cleanup EXIT +set -e + +netns_setup +netns_setup_neigh +netns_test_connectivity From 9f4c53ca23a28c891d2bd3ff4738f7d95ba0303b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:06 +0200 Subject: [PATCH 47/62] bpf, selftests: Add redirect_peer selftest Extend the test_tc_redirect test and add a small test that exercises the new redirect_peer() helper for the IPv4 and IPv6 case. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201010234006.7075-7-daniel@iogearbox.net --- .../selftests/bpf/progs/test_tc_peer.c | 45 +++++++++++++++++++ .../testing/selftests/bpf/test_tc_redirect.sh | 25 +++++++---- 2 files changed, 61 insertions(+), 9 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_tc_peer.c diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c new file mode 100644 index 000000000000..fc84a7685aa2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include +#include +#include + +#include + +enum { + dev_src, + dev_dst, +}; + +struct bpf_map_def SEC("maps") ifindex_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 2, +}; + +static __always_inline int get_dev_ifindex(int which) +{ + int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); + + return ifindex ? 
*ifindex : 0; +} + +SEC("chk_egress") int tc_chk(struct __sk_buff *skb) +{ + return TC_ACT_SHOT; +} + +SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) +{ + return bpf_redirect_peer(get_dev_ifindex(dev_src), 0); +} + +SEC("src_ingress") int tc_src(struct __sk_buff *skb) +{ + return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0); +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh index 6ad441405132..6d7482562140 100755 --- a/tools/testing/selftests/bpf/test_tc_redirect.sh +++ b/tools/testing/selftests/bpf/test_tc_redirect.sh @@ -4,9 +4,10 @@ # This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link # between src and dst. The netns fwd has veth links to each src and dst. The # client is in src and server in dst. The test installs a TC BPF program to each -# host facing veth in fwd which calls into bpf_redirect_peer() to perform the -# neigh addr population and redirect; it also installs a dropper prog on the -# egress side to drop skbs if neigh addrs were not populated. +# host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the +# neigh addr population and redirect or ii) bpf_redirect_peer() for namespace +# switch from ingress side; it also installs a checker prog on the egress side +# to drop unexpected traffic. if [[ $EUID -ne 0 ]]; then echo "This script must be run as root" @@ -166,15 +167,17 @@ hex_mem_str() perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1 } -netns_setup_neigh() +netns_setup_bpf() { + local obj=$1 + ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.o sec src_ingress - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.o sec chk_egress + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj $obj sec chk_egress ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.o sec dst_ingress - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.o sec chk_egress + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj $obj sec chk_egress veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex) veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex) @@ -193,5 +196,9 @@ trap netns_cleanup EXIT set -e netns_setup -netns_setup_neigh +netns_setup_bpf test_tc_neigh.o +netns_test_connectivity +netns_cleanup +netns_setup +netns_setup_bpf test_tc_peer.o netns_test_connectivity From 8ac91df6de164923c1025e340ea81d085fb4ab85 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 11 Oct 2020 03:17:32 +0900 Subject: [PATCH 48/62] samples: bpf: Refactor xdp_monitor with libbpf To avoid confusion caused by the increasing fragmentation of the BPF Loader program, this commit would like to change to the libbpf loader instead of using the bpf_load. Thanks to libbpf's bpf_link interface, managing the tracepoint BPF program is much easier. 
bpf_program__attach_tracepoint manages the enable of tracepoint event and attach of BPF programs to it with a single interface bpf_link, so there is no need to manage event_fd and prog_fd separately. This commit refactors xdp_monitor with using this libbpf API, and the bpf_load is removed and migrated to libbpf. Signed-off-by: Daniel T. Lee Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010181734.1109-2-danieltimlee@gmail.com --- samples/bpf/Makefile | 2 +- samples/bpf/xdp_monitor_user.c | 159 +++++++++++++++++++++++++-------- 2 files changed, 121 insertions(+), 40 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index e29b1e89dd3e..50ad7e8b05bc 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -99,7 +99,7 @@ per_socket_stats_example-objs := cookie_uid_helper_example.o xdp_redirect-objs := xdp_redirect_user.o xdp_redirect_map-objs := xdp_redirect_map_user.o xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o -xdp_monitor-objs := bpf_load.o xdp_monitor_user.o +xdp_monitor-objs := xdp_monitor_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := syscall_tp_user.o cpustat-objs := cpustat_user.o diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index ef53b93db573..03d0a182913f 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -26,12 +26,37 @@ static const char *__doc_err_only__= #include #include +#include #include -#include "bpf_load.h" +#include #include "bpf_util.h" +enum map_type { + REDIRECT_ERR_CNT, + EXCEPTION_CNT, + CPUMAP_ENQUEUE_CNT, + CPUMAP_KTHREAD_CNT, + DEVMAP_XMIT_CNT, +}; + +static const char *const map_type_strings[] = { + [REDIRECT_ERR_CNT] = "redirect_err_cnt", + [EXCEPTION_CNT] = "exception_cnt", + [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", + [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", + [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt", +}; + +#define NUM_MAP 5 +#define NUM_TP 8 + +static int tp_cnt; +static int map_cnt; static int verbose = 1; static bool debug = false; +struct bpf_map *map_data[NUM_MAP] = {}; +struct bpf_link *tp_links[NUM_TP] = {}; +struct bpf_object *obj; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -41,6 +66,16 @@ static const struct option long_options[] = { {0, 0, NULL, 0 } }; +static void int_exit(int sig) +{ + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); + exit(0); +} + /* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ #define EXIT_FAIL_MEM 5 @@ -483,23 +518,23 @@ static bool stats_collect(struct stats_record *rec) * this can happen by someone running perf-record -e */ - fd = map_data[0].fd; /* map0: redirect_err_cnt */ + fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]); for (i = 0; i < REDIR_RES_MAX; i++) map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); - fd = map_data[1].fd; /* map1: exception_cnt */ + fd = bpf_map__fd(map_data[EXCEPTION_CNT]); for (i = 0; i < XDP_ACTION_MAX; i++) { map_collect_record_u64(fd, i, &rec->xdp_exception[i]); } - fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */ + fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]); for (i = 0; i < MAX_CPUS; i++) map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); - fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ + fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]); map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); - fd = map_data[4].fd; /* map4: devmap_xmit_cnt */ + fd = 
bpf_map__fd(map_data[DEVMAP_XMIT_CNT]); map_collect_record(fd, 0, &rec->xdp_devmap_xmit); return true; @@ -598,8 +633,8 @@ static void stats_poll(int interval, bool err_only) /* TODO Need more advanced stats on error types */ if (verbose) { - printf(" - Stats map0: %s\n", map_data[0].name); - printf(" - Stats map1: %s\n", map_data[1].name); + printf(" - Stats map0: %s\n", bpf_map__name(map_data[0])); + printf(" - Stats map1: %s\n", bpf_map__name(map_data[1])); printf("\n"); } fflush(stdout); @@ -618,44 +653,51 @@ static void stats_poll(int interval, bool err_only) static void print_bpf_prog_info(void) { - int i; + struct bpf_program *prog; + struct bpf_map *map; + int i = 0; /* Prog info */ - printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt); - for (i = 0; i < prog_cnt; i++) { - printf(" - prog_fd[%d] = fd(%d)\n", i, prog_fd[i]); + printf("Loaded BPF prog have %d bpf program(s)\n", tp_cnt); + bpf_object__for_each_program(prog, obj) { + printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog)); + i++; } + i = 0; /* Maps info */ - printf("Loaded BPF prog have %d map(s)\n", map_data_count); - for (i = 0; i < map_data_count; i++) { - char *name = map_data[i].name; - int fd = map_data[i].fd; + printf("Loaded BPF prog have %d map(s)\n", map_cnt); + bpf_object__for_each_map(map, obj) { + const char *name = bpf_map__name(map); + int fd = bpf_map__fd(map); printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name); + i++; } /* Event info */ - printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt); - for (i = 0; i < prog_cnt; i++) { - if (event_fd[i] != -1) - printf(" - event_fd[%d] = fd(%d)\n", i, event_fd[i]); + printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt); + for (i = 0; i < tp_cnt; i++) { + int fd = bpf_link__fd(tp_links[i]); + + if (fd != -1) + printf(" - event_fd[%d] = fd(%d)\n", i, fd); } } int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_program *prog; int longindex = 0, opt; - int ret = EXIT_SUCCESS; - char bpf_obj_file[256]; + int ret = EXIT_FAILURE; + enum map_type type; + char filename[256]; /* Default settings: */ bool errors_only = true; int interval = 2; - snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]); - /* Parse commands line args */ while ((opt = getopt_long(argc, argv, "hDSs:", long_options, &longindex)) != -1) { @@ -672,40 +714,79 @@ int main(int argc, char **argv) case 'h': default: usage(argv); - return EXIT_FAILURE; + return ret; } } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); if (setrlimit(RLIMIT_MEMLOCK, &r)) { perror("setrlimit(RLIMIT_MEMLOCK)"); - return EXIT_FAILURE; + return ret; } - if (load_bpf_file(bpf_obj_file)) { - printf("ERROR - bpf_log_buf: %s", bpf_log_buf); - return EXIT_FAILURE; + /* Remove tracepoint program when program is interrupted or killed */ + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + printf("ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; } - if (!prog_fd[0]) { - printf("ERROR - load_bpf_file: %s\n", strerror(errno)); - return EXIT_FAILURE; + + /* load BPF program */ + if (bpf_object__load(obj)) { + printf("ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + for (type = 0; type < NUM_MAP; type++) { + map_data[type] = + bpf_object__find_map_by_name(obj, map_type_strings[type]); + + if (libbpf_get_error(map_data[type])) { + printf("ERROR: finding a map in obj file failed\n"); + goto 
cleanup; + } + map_cnt++; + } + + bpf_object__for_each_program(prog, obj) { + tp_links[tp_cnt] = bpf_program__attach(prog); + if (libbpf_get_error(tp_links[tp_cnt])) { + printf("ERROR: bpf_program__attach failed\n"); + tp_links[tp_cnt] = NULL; + goto cleanup; + } + tp_cnt++; } if (debug) { print_bpf_prog_info(); } - /* Unload/stop tracepoint event by closing fd's */ + /* Unload/stop tracepoint event by closing bpf_link's */ if (errors_only) { - /* The prog_fd[i] and event_fd[i] depend on the - * order the functions was defined in _kern.c + /* The bpf_link[i] depend on the order of + * the functions was defined in _kern.c */ - close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */ - close(prog_fd[2]); /* func: trace_xdp_redirect */ - close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */ - close(prog_fd[3]); /* func: trace_xdp_redirect_map */ + bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */ + tp_links[2] = NULL; + + bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */ + tp_links[3] = NULL; } stats_poll(interval, errors_only); + ret = EXIT_SUCCESS; + +cleanup: + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); return ret; } From 151936bf51afcdd6db52b3b73e339d021064b0ea Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 11 Oct 2020 03:17:33 +0900 Subject: [PATCH 49/62] samples: bpf: Replace attach_tracepoint() to attach() in xdp_redirect_cpu >From commit d7a18ea7e8b6 ("libbpf: Add generic bpf_program__attach()"), for some BPF programs, it is now possible to attach BPF programs with __attach() instead of explicitly calling __attach_(). This commit refactors the __attach_tracepoint() with libbpf's generic __attach() method. In addition, this refactors the logic of setting the map FD to simplify the code. Also, the missing removal of bpf_load.o in Makefile has been fixed. Signed-off-by: Daniel T. 
Lee Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010181734.1109-3-danieltimlee@gmail.com --- samples/bpf/Makefile | 2 +- samples/bpf/xdp_redirect_cpu_user.c | 149 +++++++++++++--------------- 2 files changed, 71 insertions(+), 80 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 50ad7e8b05bc..aeebf5d12f32 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -98,7 +98,7 @@ test_map_in_map-objs := test_map_in_map_user.o per_socket_stats_example-objs := cookie_uid_helper_example.o xdp_redirect-objs := xdp_redirect_user.o xdp_redirect_map-objs := xdp_redirect_map_user.o -xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o +xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o xdp_monitor-objs := xdp_monitor_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := syscall_tp_user.o diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 3dd366e9474d..6fb8dbde62c5 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -37,18 +37,35 @@ static __u32 prog_id; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static int n_cpus; -static int cpu_map_fd; -static int rx_cnt_map_fd; -static int redirect_err_cnt_map_fd; -static int cpumap_enqueue_cnt_map_fd; -static int cpumap_kthread_cnt_map_fd; -static int cpus_available_map_fd; -static int cpus_count_map_fd; -static int cpus_iterator_map_fd; -static int exception_cnt_map_fd; + +enum map_type { + CPU_MAP, + RX_CNT, + REDIRECT_ERR_CNT, + CPUMAP_ENQUEUE_CNT, + CPUMAP_KTHREAD_CNT, + CPUS_AVAILABLE, + CPUS_COUNT, + CPUS_ITERATOR, + EXCEPTION_CNT, +}; + +static const char *const map_type_strings[] = { + [CPU_MAP] = "cpu_map", + [RX_CNT] = "rx_cnt", + [REDIRECT_ERR_CNT] = "redirect_err_cnt", + [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", + [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", + [CPUS_AVAILABLE] = "cpus_available", + [CPUS_COUNT] = "cpus_count", + [CPUS_ITERATOR] = "cpus_iterator", + [EXCEPTION_CNT] = "exception_cnt", +}; #define NUM_TP 5 -struct bpf_link *tp_links[NUM_TP] = { 0 }; +#define NUM_MAP 9 +struct bpf_link *tp_links[NUM_TP] = {}; +static int map_fds[NUM_MAP]; static int tp_cnt = 0; /* Exit return codes */ @@ -527,20 +544,20 @@ static void stats_collect(struct stats_record *rec) { int fd, i; - fd = rx_cnt_map_fd; + fd = map_fds[RX_CNT]; map_collect_percpu(fd, 0, &rec->rx_cnt); - fd = redirect_err_cnt_map_fd; + fd = map_fds[REDIRECT_ERR_CNT]; map_collect_percpu(fd, 1, &rec->redir_err); - fd = cpumap_enqueue_cnt_map_fd; + fd = map_fds[CPUMAP_ENQUEUE_CNT]; for (i = 0; i < n_cpus; i++) map_collect_percpu(fd, i, &rec->enq[i]); - fd = cpumap_kthread_cnt_map_fd; + fd = map_fds[CPUMAP_KTHREAD_CNT]; map_collect_percpu(fd, 0, &rec->kthread); - fd = exception_cnt_map_fd; + fd = map_fds[EXCEPTION_CNT]; map_collect_percpu(fd, 0, &rec->exception); } @@ -565,7 +582,7 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, /* Add a CPU entry to cpumap, as this allocate a cpu entry in * the kernel for the cpu. */ - ret = bpf_map_update_elem(cpu_map_fd, &cpu, value, 0); + ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0); if (ret) { fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret); exit(EXIT_FAIL_BPF); @@ -574,21 +591,21 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, /* Inform bpf_prog's that a new CPU is available to select * from via some control maps. 
*/ - ret = bpf_map_update_elem(cpus_available_map_fd, &avail_idx, &cpu, 0); + ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0); if (ret) { fprintf(stderr, "Add to avail CPUs failed\n"); exit(EXIT_FAIL_BPF); } /* When not replacing/updating existing entry, bump the count */ - ret = bpf_map_lookup_elem(cpus_count_map_fd, &key, &curr_cpus_count); + ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count); if (ret) { fprintf(stderr, "Failed reading curr cpus_count\n"); exit(EXIT_FAIL_BPF); } if (new) { curr_cpus_count++; - ret = bpf_map_update_elem(cpus_count_map_fd, &key, + ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count, 0); if (ret) { fprintf(stderr, "Failed write curr cpus_count\n"); @@ -612,7 +629,7 @@ static void mark_cpus_unavailable(void) int ret, i; for (i = 0; i < n_cpus; i++) { - ret = bpf_map_update_elem(cpus_available_map_fd, &i, + ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i, &invalid_cpu, 0); if (ret) { fprintf(stderr, "Failed marking CPU unavailable\n"); @@ -665,68 +682,37 @@ static void stats_poll(int interval, bool use_separators, char *prog_name, free_stats_record(prev); } -static struct bpf_link * attach_tp(struct bpf_object *obj, - const char *tp_category, - const char* tp_name) +static int init_tracepoints(struct bpf_object *obj) { struct bpf_program *prog; - struct bpf_link *link; - char sec_name[PATH_MAX]; - int len; - len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s", - tp_category, tp_name); - if (len < 0) - exit(EXIT_FAIL); + bpf_object__for_each_program(prog, obj) { + if (bpf_program__is_tracepoint(prog) != true) + continue; - prog = bpf_object__find_program_by_title(obj, sec_name); - if (!prog) { - fprintf(stderr, "ERR: finding progsec: %s\n", sec_name); - exit(EXIT_FAIL_BPF); + tp_links[tp_cnt] = bpf_program__attach(prog); + if (libbpf_get_error(tp_links[tp_cnt])) { + tp_links[tp_cnt] = NULL; + return -EINVAL; + } + tp_cnt++; } - link = bpf_program__attach_tracepoint(prog, tp_category, tp_name); - if (libbpf_get_error(link)) - exit(EXIT_FAIL_BPF); - - return link; -} - -static void init_tracepoints(struct bpf_object *obj) { - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread"); + return 0; } static int init_map_fds(struct bpf_object *obj) { - /* Maps updated by tracepoints */ - redirect_err_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt"); - exception_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "exception_cnt"); - cpumap_enqueue_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt"); - cpumap_kthread_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt"); + enum map_type type; - /* Maps used by XDP */ - rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt"); - cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map"); - cpus_available_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpus_available"); - cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count"); - cpus_iterator_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpus_iterator"); + for (type = 0; type < NUM_MAP; type++) { + map_fds[type] = + bpf_object__find_map_fd_by_name(obj, + map_type_strings[type]); - if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 || - 
redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 || - cpumap_kthread_cnt_map_fd < 0 || cpus_available_map_fd < 0 || - cpus_count_map_fd < 0 || cpus_iterator_map_fd < 0 || - exception_cnt_map_fd < 0) - return -ENOENT; + if (map_fds[type] < 0) + return -ENOENT; + } return 0; } @@ -795,13 +781,13 @@ int main(int argc, char **argv) bool stress_mode = false; struct bpf_program *prog; struct bpf_object *obj; + int err = EXIT_FAIL; char filename[256]; int added_cpus = 0; int longindex = 0; int interval = 2; int add_cpu = -1; - int opt, err; - int prog_fd; + int opt, prog_fd; int *cpu, i; __u32 qsize; @@ -824,24 +810,29 @@ int main(int argc, char **argv) } if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - return EXIT_FAIL; + return err; if (prog_fd < 0) { fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", strerror(errno)); - return EXIT_FAIL; + return err; } - init_tracepoints(obj); + + if (init_tracepoints(obj) < 0) { + fprintf(stderr, "ERR: bpf_program__attach failed\n"); + return err; + } + if (init_map_fds(obj) < 0) { fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n"); - return EXIT_FAIL; + return err; } mark_cpus_unavailable(); cpu = malloc(n_cpus * sizeof(int)); if (!cpu) { fprintf(stderr, "failed to allocate cpu array\n"); - return EXIT_FAIL; + return err; } memset(cpu, 0, n_cpus * sizeof(int)); @@ -960,14 +951,12 @@ int main(int argc, char **argv) prog = bpf_object__find_program_by_title(obj, prog_name); if (!prog) { fprintf(stderr, "bpf_object__find_program_by_title failed\n"); - err = EXIT_FAIL; goto out; } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { fprintf(stderr, "bpf_program__fd failed\n"); - err = EXIT_FAIL; goto out; } @@ -986,6 +975,8 @@ int main(int argc, char **argv) stats_poll(interval, use_separators, prog_name, mprog_name, &value, stress_mode); + + err = EXIT_OK; out: free(cpu); return err; From 321f6324500eb2ec75b6fcff7dcd66d64ba18529 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 11 Oct 2020 03:17:34 +0900 Subject: [PATCH 50/62] samples: bpf: Refactor XDP kern program maps with BTF-defined map Most of the samples were converted to use the new BTF-defined MAP as they moved to libbpf, but some of the samples were missing. Instead of using the previous BPF MAP definition, this commit refactors xdp_monitor and xdp_sample_pkts_kern MAP definition with the new BTF-defined MAP format. Also, this commit removes the max_entries attribute at PERF_EVENT_ARRAY map type. The libbpf's bpf_object__create_map() will automatically set max_entries to the maximum configured number of CPUs on the host. Signed-off-by: Daniel T. 
Lee Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010181734.1109-4-danieltimlee@gmail.com --- samples/bpf/xdp_monitor_kern.c | 60 +++++++++++++++--------------- samples/bpf/xdp_sample_pkts_kern.c | 12 +++--- samples/bpf/xdp_sample_pkts_user.c | 1 - 3 files changed, 35 insertions(+), 38 deletions(-) diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 3d33cca2d48a..5c955b812c47 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -6,21 +6,21 @@ #include #include -struct bpf_map_def SEC("maps") redirect_err_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 2, +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 2); /* TODO: have entries for all possible errno's */ -}; +} redirect_err_cnt SEC(".maps"); #define XDP_UNKNOWN XDP_REDIRECT + 1 -struct bpf_map_def SEC("maps") exception_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = XDP_UNKNOWN + 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, XDP_UNKNOWN + 1); +} exception_cnt SEC(".maps"); /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format * Code in: kernel/include/trace/events/xdp.h @@ -129,19 +129,19 @@ struct datarec { }; #define MAX_CPUS 64 -struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = MAX_CPUS, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, MAX_CPUS); +} cpumap_enqueue_cnt SEC(".maps"); -struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} cpumap_kthread_cnt SEC(".maps"); /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format * Code in: kernel/include/trace/events/xdp.h @@ -210,12 +210,12 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) return 0; } -struct bpf_map_def SEC("maps") devmap_xmit_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} devmap_xmit_cnt SEC(".maps"); /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format * Code in: kernel/include/trace/events/xdp.h diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c index 33377289e2a8..9cf76b340dd7 100644 --- a/samples/bpf/xdp_sample_pkts_kern.c +++ b/samples/bpf/xdp_sample_pkts_kern.c @@ -5,14 +5,12 @@ #include #define SAMPLE_SIZE 64ul -#define MAX_CPUS 128 -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = MAX_CPUS, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); +} my_map SEC(".maps"); SEC("xdp_sample") int 
xdp_sample_prog(struct xdp_md *ctx) diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c index 991ef6f0880b..4b2a300c750c 100644 --- a/samples/bpf/xdp_sample_pkts_user.c +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -18,7 +18,6 @@ #include "perf-sys.h" -#define MAX_CPUS 128 static int if_idx; static char *if_name; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; From d1c362e1dd68a421cf9033404cf141a4ab734a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 9 Oct 2020 20:42:34 +0200 Subject: [PATCH 51/62] bpf: Always return target ifindex in bpf_fib_lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_fib_lookup() helper performs a neighbour lookup for the destination IP and returns BPF_FIB_LKUP_NO_NEIGH if this fails, with the expectation that the BPF program will pass the packet up the stack in this case. However, with the addition of bpf_redirect_neigh() that can be used instead to perform the neighbour lookup, at the cost of a bit of duplicated work. For that we still need the target ifindex, and since bpf_fib_lookup() already has that at the time it performs the neighbour lookup, there is really no reason why it can't just return it in any case. So let's just always return the ifindex if the FIB lookup itself succeeds. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Cc: David Ahern Link: https://lore.kernel.org/bpf/20201009184234.134214-1-toke@redhat.com --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index fab951c6be57..96ec4cbbf4e0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5234,7 +5234,6 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; - params->ifindex = dev->ifindex; return 0; } @@ -5331,6 +5330,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = nhc->nhc_dev; params->rt_metric = res.fi->fib_priority; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here @@ -5456,6 +5456,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = res.nh->fib_nh_dev; params->rt_metric = res.f6i->fib6_metric; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. From ebb034b15bfa5182c78939dca88022d76afedf24 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 11 Oct 2020 13:01:49 -0700 Subject: [PATCH 52/62] bpf: Migrate from patchwork.ozlabs.org to patchwork.kernel.org. Move the bpf/bpf-next patch processing queue to patchwork.kernel.org. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201011200149.66537-1-alexei.starovoitov@gmail.com --- Documentation/bpf/bpf_devel_QA.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index 75a0dca5f295..5b613d2a5f1a 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -60,13 +60,13 @@ Q: Where can I find patches currently under discussion for BPF subsystem? 
A: All patches that are Cc'ed to netdev are queued for review under netdev patchwork project: - http://patchwork.ozlabs.org/project/netdev/list/ + https://patchwork.kernel.org/project/netdevbpf/list/ Those patches which target BPF, are assigned to a 'bpf' delegate for further processing from BPF maintainers. The current queue with patches under review can be found at: - https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147 + https://patchwork.kernel.org/project/netdevbpf/list/?delegate=121173 Once the patches have been reviewed by the BPF community as a whole and approved by the BPF maintainers, their status in patchwork will be From cfea28f890cf292d5fe90680db64b68086ef25ba Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:36:16 -0700 Subject: [PATCH 53/62] bpf, sockmap: Skb verdict SK_PASS to self already checked rmem limits For sk_skb case where skb_verdict program returns SK_PASS to continue to pass packet up the stack, the memory limits were already checked before enqueuing in skb_queue_tail from TCP side. So, lets remove the extra checks here. The theory is if the TCP stack believes we have memory to receive the packet then lets trust the stack and not double check the limits. In fact the accounting here can cause a drop if sk_rmem_alloc has increased after the stack accepted this packet, but before the duplicate check here. And worse if this happens because TCP stack already believes the data has been received there is no retransmit. Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226857664.5692.668205469388498375.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4b5f7c8fecd1..040ae1d75b65 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -771,6 +771,7 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { + struct tcp_skb_cb *tcp; struct sock *sk_other; switch (verdict) { @@ -780,16 +781,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { goto out_free; } - if (atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf) { - struct tcp_skb_cb *tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; - skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); - break; - } - goto out_free; + tcp = TCP_SKB_CB(skb); + tcp->bpf.flags |= BPF_F_INGRESS; + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + break; case __SK_REDIRECT: sk_psock_skb_redirect(skb); break; From 9ecbfb06a078c4911fb444203e8e41d93d22f886 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:36:37 -0700 Subject: [PATCH 54/62] bpf, sockmap: On receive programs try to fast track SK_PASS ingress When we receive an skb and the ingress skb verdict program returns SK_PASS we currently set the ingress flag and put it on the workqueue so it can be turned into a sk_msg and put on the sk_msg ingress queue. Then finally telling userspace with data_ready hook. Here we observe that if the workqueue is empty then we can try to convert into a sk_msg type and call data_ready directly without bouncing through a workqueue. Its a common pattern to have a recv verdict program for visibility that always returns SK_PASS. 
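For reference, a visibility-only verdict program of this kind can be as small as the sketch below (illustrative only, not part of this patch; the SEC() name and program name follow common libbpf conventions and are assumptions here):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_skb/stream_verdict")
int visibility_verdict(struct __sk_buff *skb)
{
        /* a real program would bump a counter or map entry here */
        return SK_PASS;
}

char _license[] SEC("license") = "GPL";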
In this case unless there is an ENOMEM error or we overrun the socket we can avoid the workqueue completely only using it when we fall back to error cases caused by memory pressure. By doing this we eliminate another case where data may be dropped if errors occur on memory limits in workqueue. Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226859704.5692.12929678876744977669.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 040ae1d75b65..4b160d97b7f9 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -773,6 +773,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, { struct tcp_skb_cb *tcp; struct sock *sk_other; + int err = -EIO; switch (verdict) { case __SK_PASS: @@ -784,8 +785,20 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, tcp = TCP_SKB_CB(skb); tcp->bpf.flags |= BPF_F_INGRESS; - skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + + /* If the queue is empty then we can submit directly + * into the msg queue. If its not empty we have to + * queue work otherwise we may get OOO data. Otherwise, + * if sk_psock_skb_ingress errors will be handled by + * retrying later from workqueue. + */ + if (skb_queue_empty(&psock->ingress_skb)) { + err = sk_psock_skb_ingress(psock, skb); + } + if (err < 0) { + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + } break; case __SK_REDIRECT: sk_psock_skb_redirect(skb); From 29545f4977cf7c7b721aa4d43418632af1d6836d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:36:57 -0700 Subject: [PATCH 55/62] bpf, sockmap: Remove skb_set_owner_w wmem will be taken later from sendpage The skb_set_owner_w is unnecessary here. The sendpage call will create a fresh skb and set the owner correctly from workqueue. Its also not entirely harmless because it consumes cycles, but also impacts resource accounting by increasing sk_wmem_alloc. This is charging the socket we are going to send to for the skb, but we will put it on the workqueue for some time before this happens so we are artifically inflating sk_wmem_alloc for this period. Further, we don't know how many skbs will be used to send the packet or how it will be broken up when sent over the new socket so charging it with one big sum is also not correct when the workqueue may break it up if facing memory pressure. Seeing we don't know how/when this is going to be sent drop the early accounting. A later patch will do proper accounting charged on receive socket for the case where skbs get enqueued on the workqueue. 
Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226861708.5692.17964237936462425136.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4b160d97b7f9..7389d5d7e7f8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -728,8 +728,6 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) (ingress && atomic_read(&sk_other->sk_rmem_alloc) <= sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); } else { From 9047f19e7ccba5073d983dd3882b5f46086b578a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:37:17 -0700 Subject: [PATCH 56/62] bpf, sockmap: Remove dropped data on errors in redirect case In the sk_skb redirect case we didn't handle the case where we overrun the sk_rmem_alloc entry on ingress redirect or sk_wmem_alloc on egress. Because we didn't have anything implemented we simply dropped the skb. This meant data could be dropped if socket memory accounting was in place. This fixes the above dropped data case by moving the memory checks later in the code where we actually do the send or recv. This pushes those checks into the workqueue and allows us to return an EAGAIN error which in turn allows us to try again later from the workqueue. Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226863689.5692.13861422742592309285.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 7389d5d7e7f8..880b84baab5e 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -433,10 +433,12 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { - if (ingress) - return sk_psock_skb_ingress(psock, skb); - else + if (!ingress) { + if (!sock_writeable(psock->sk)) + return -EAGAIN; return skb_send_sock_locked(psock->sk, skb, off, len); + } + return sk_psock_skb_ingress(psock, skb); } static void sk_psock_backlog(struct work_struct *work) @@ -709,30 +711,28 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; - bool ingress; sk_other = tcp_skb_bpf_redirect_fetch(skb); + /* This error is a buggy BPF program, it returned a redirect + * return code, but then didn't set a redirect interface. + */ if (unlikely(!sk_other)) { kfree_skb(skb); return; } psock_other = sk_psock(sk_other); + /* This error indicates the socket is being torn down or had another + * error that caused the pipe to break. We can't send a packet on + * a socket that is in this state so we drop the skb. 
+ */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { kfree_skb(skb); return; } - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - } else { - kfree_skb(skb); - } + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict) From 10d58d006356a075a7b056e0f6502db416d1a261 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:37:35 -0700 Subject: [PATCH 57/62] bpf, sockmap: Remove skb_orphan and let normal skb_kfree do cleanup Calling skb_orphan() is unnecessary in the strp rcv handler because the skb is from a skb_clone() in __strp_recv. So it never has a destructor or a sk assigned. Plus its confusing to read because it might hint to the reader that the skb could have an sk assigned which is not true. Even if we did have an sk assigned it would be cleaner to simply wait for the upcoming kfree_skb(). Additionally, move the comment about strparser clone up so its closer to the logic it is describing and add to it so that it is more complete. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226865548.5692.9098315689984599579.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 880b84baab5e..3e78f2a80747 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -686,15 +686,16 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, { int ret; + /* strparser clones the skb before handing it to a upper layer, + * meaning we have the same data, but sk is NULL. We do want an + * sk pointer though when we run the BPF program. So we set it + * here and then NULL it to ensure we don't trigger a BUG_ON() + * in skb/sk operations later if kfree_skb is called with a + * valid skb->sk pointer and no destructor assigned. + */ skb->sk = psock->sk; bpf_compute_data_end_sk_skb(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); - /* strparser clones the skb before handing it to a upper layer, - * meaning skb_orphan has been called. We NULL sk on the way out - * to ensure we don't trigger a BUG_ON() in skb/sk operations - * later and because we are not charging the memory of this skb - * to any socket yet. - */ skb->sk = NULL; return ret; } @@ -824,7 +825,6 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) } prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - skb_orphan(skb); tcp_skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); From 0b17ad25d8d102ffb83dcc99ebd24719194ebf4f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:37:55 -0700 Subject: [PATCH 58/62] bpf, sockmap: Add memory accounting so skbs on ingress lists are visible Move skb->sk assignment out of sk_psock_bpf_run() and into individual callers. Then we can use proper skb_set_owner_r() call to assign a sk to a skb. This improves things by also charging the truesize against the sockets sk_rmem_alloc counter. 
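For context, skb_set_owner_r() is roughly the inline below (a simplified sketch, not the exact upstream definition); the charge lands on sk_rmem_alloc and sock_rfree() gives it back when the skb is freed:

static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_rfree;   /* uncharges sk_rmem_alloc on kfree_skb() */
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_charge(sk, skb->truesize);
}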
With this done we get some accounting in place to ensure the memory associated with skbs on the workqueue are still being accounted for somewhere. Finally, by using skb_set_owner_r the destructor is setup so we can just let the normal skb_kfree logic recover the memory. Combined with previous patch dropping skb_orphan() we now can recover from memory pressure and maintain accounting. Note, we will charge the skbs against their originating socket even if being redirected into another socket. Once the skb completes the redirect op the kfree_skb will give the memory back. This is important because if we charged the socket we are redirecting to (like it was done before this series) the sock_writeable() test could fail because of the skb trying to be sent is already charged against the socket. Also TLS case is special. Here we wait until we have decided not to simply PASS the packet up the stack. In the case where we PASS the packet up the stack we already have an skb which is accounted for on the TLS socket context. For the parser case we continue to just set/clear skb->sk this is because the skb being used here may be combined with other skbs or turned into multiple skbs depending on the parser logic. For example the parser could request a payload length greater than skb->len so that the strparser needs to collect multiple skbs. At any rate the final result will be handled in the strparser recv callback. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226867513.5692.10579573214635925960.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3e78f2a80747..881a5b290946 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -684,20 +684,8 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, struct sk_buff *skb) { - int ret; - - /* strparser clones the skb before handing it to a upper layer, - * meaning we have the same data, but sk is NULL. We do want an - * sk pointer though when we run the BPF program. So we set it - * here and then NULL it to ensure we don't trigger a BUG_ON() - * in skb/sk operations later if kfree_skb is called with a - * valid skb->sk pointer and no destructor assigned. - */ - skb->sk = psock->sk; bpf_compute_data_end_sk_skb(skb); - ret = bpf_prog_run_pin_on_cpu(prog, skb); - skb->sk = NULL; - return ret; + return bpf_prog_run_pin_on_cpu(prog, skb); } static struct sk_psock *sk_psock_from_strp(struct strparser *strp) @@ -736,10 +724,11 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) schedule_work(&psock_other->work); } -static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict) +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) { switch (verdict) { case __SK_REDIRECT: + skb_set_owner_r(skb, sk); sk_psock_skb_redirect(skb); break; case __SK_PASS: @@ -757,11 +746,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) rcu_read_lock(); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { + /* We skip full set_owner_r here because if we do a SK_PASS + * or SK_DROP we can skip skb memory accounting and use the + * TLS context. 
+ */ + skb->sk = psock->sk; tcp_skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + skb->sk = NULL; } - sk_psock_tls_verdict_apply(skb, ret); + sk_psock_tls_verdict_apply(skb, psock->sk, ret); rcu_read_unlock(); return ret; } @@ -823,6 +818,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) kfree_skb(skb); goto out; } + skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { tcp_skb_bpf_redirect_clear(skb); @@ -847,8 +843,11 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) rcu_read_lock(); prog = READ_ONCE(psock->progs.skb_parser); - if (likely(prog)) + if (likely(prog)) { + skb->sk = psock->sk; ret = sk_psock_bpf_run(psock, prog, skb); + skb->sk = NULL; + } rcu_read_unlock(); return ret; } From 743df8b7749fb5a289fc0c7ac94ec15533596839 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:09:07 -0700 Subject: [PATCH 59/62] bpf, sockmap: Check skb_verdict and skb_parser programs explicitly We are about to allow skb_verdict to run without skb_parser programs as a first step change code to check each program type specifically. This should be a mechanical change without any impact to actual result. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239294756.8495.5796595770890272219.stgit@john-Precision-5820-Tower --- net/core/sock_map.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e83a80e8f13b..a2ed5b6223b9 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -230,16 +230,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, { struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; struct sk_psock *psock; - bool skb_progs; int ret; skb_verdict = READ_ONCE(progs->skb_verdict); skb_parser = READ_ONCE(progs->skb_parser); - skb_progs = skb_parser && skb_verdict; - if (skb_progs) { + if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) return PTR_ERR(skb_verdict); + } + if (skb_parser) { skb_parser = bpf_prog_inc_not_zero(skb_parser); if (IS_ERR(skb_parser)) { bpf_prog_put(skb_verdict); @@ -264,7 +264,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_progs && READ_ONCE(psock->progs.skb_parser))) { + (skb_parser && READ_ONCE(psock->progs.skb_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -285,7 +286,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_progs && !psock->parser.enabled) { + if (skb_parser && skb_verdict && !psock->parser.enabled) { ret = sk_psock_init_strp(sk, psock); if (ret) { write_unlock_bh(&sk->sk_callback_lock); @@ -303,10 +304,10 @@ out_progs: if (msg_parser) bpf_prog_put(msg_parser); out: - if (skb_progs) { + if (skb_verdict) bpf_prog_put(skb_verdict); + if (skb_parser) bpf_prog_put(skb_parser); - } return ret; } From ef5659280eb13e8ac31c296f58cfdfa1684ac06b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:09:38 -0700 Subject: [PATCH 60/62] bpf, sockmap: Allow skipping sk_skb parser program Currently, we often run with a nop parser namely one that just does this, 'return 
skb->len'. This happens when either our verdict program can handle streaming data or it is only looking at socket data such as IP addresses and other metadata associated with the flow. The second case is common for a L3/L4 proxy for instance. So lets allow loading programs without the parser then we can skip the stream parser logic and avoid having to add a BPF program that is effectively a nop. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239297866.8495.13345662302749219672.stgit@john-Precision-5820-Tower --- include/linux/skmsg.h | 2 ++ net/core/skmsg.c | 78 +++++++++++++++++++++++++++++++++++++++++++ net/core/sock_map.c | 22 ++++++++---- 3 files changed, 95 insertions(+), 7 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 3119928fc103..fec0c5ac1c4f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -308,6 +308,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node); int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 881a5b290946..654182ecf87b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -627,6 +627,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.skb_parser) sk_psock_stop_strp(sk, psock); + else if (psock->progs.skb_verdict) + sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -871,6 +873,57 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_unlock(); } +static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t orig_len) +{ + struct sock *sk = (struct sock *)desc->arg.data; + struct sk_psock *psock; + struct bpf_prog *prog; + int ret = __SK_DROP; + int len = skb->len; + + /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) { + desc->error = -ENOMEM; + return 0; + } + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + len = 0; + kfree_skb(skb); + goto out; + } + skb_set_owner_r(skb, sk); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + sk_psock_verdict_apply(psock, skb, ret); +out: + rcu_read_unlock(); + return len; +} + +static void sk_psock_verdict_data_ready(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + read_descriptor_t desc; + + if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + return; + + desc.arg.data = sk; + desc.error = 0; + desc.count = 1; + + sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); +} + static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; @@ -900,6 +953,19 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) return strp_init(&psock->parser.strp, sk, &cb); } +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if 
(parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_verdict_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; @@ -925,3 +991,15 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) strp_stop(&parser->strp); parser->enabled = false; } + +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + parser->enabled = false; +} diff --git a/net/core/sock_map.c b/net/core/sock_map.c index a2ed5b6223b9..df09c39a4dd2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,8 +148,8 @@ static void sock_map_add_link(struct sk_psock *psock, static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { + bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; - bool strp_stop = false; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { @@ -159,14 +159,19 @@ static void sock_map_del_link(struct sock *sk, map); if (psock->parser.enabled && stab->progs.skb_parser) strp_stop = true; + if (psock->parser.enabled && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } } spin_unlock_bh(&psock->link_lock); - if (strp_stop) { + if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); - sk_psock_stop_strp(sk, psock); + if (strp_stop) + sk_psock_stop_strp(sk, psock); + else + sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); } } @@ -288,16 +293,19 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, write_lock_bh(&sk->sk_callback_lock); if (skb_parser && skb_verdict && !psock->parser.enabled) { ret = sk_psock_init_strp(sk, psock); - if (ret) { - write_unlock_bh(&sk->sk_callback_lock); - goto out_drop; - } + if (ret) + goto out_unlock_drop; psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); sk_psock_start_strp(sk, psock); + } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; +out_unlock_drop: + write_unlock_bh(&sk->sk_callback_lock); out_drop: sk_psock_put(sk, psock); out_progs: From cdf43c4bfa1a465ba7996165b75c49b5c0e46b50 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:10:04 -0700 Subject: [PATCH 61/62] bpf, selftests: Add option to test_sockmap to omit adding parser program Add option to allow running without a parser program in place. To test with ping/pong program use, # test_sockmap -t ping --txmsg_omit_skb_parser this will send packets between two socket bouncing through a proxy socket that does not use a parser program. 
(ping) (pong) sender proxy_recv proxy_send recv | | | | verdict -----+ | | | | | +----------------+ +------------+ Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239300387.8495.11908295143121563076.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 35 +++++++++++++--------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 5cf45455de42..419c0b010d14 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -86,6 +86,7 @@ int txmsg_ktls_skb_redir; int ktls; int peek_flag; int skb_use_parser; +int txmsg_omit_skb_parser; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -111,6 +112,7 @@ static const struct option long_options[] = { {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, + {"txmsg_omit_skb_parser", no_argument, &txmsg_omit_skb_parser, 1}, {"whitelist", required_argument, NULL, 'n' }, {"blacklist", required_argument, NULL, 'b' }, {0, 0, NULL, 0 } @@ -175,6 +177,7 @@ static void test_reset(void) txmsg_apply = txmsg_cork = 0; txmsg_ingress = txmsg_redir_skb = 0; txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; + txmsg_omit_skb_parser = 0; skb_use_parser = 0; } @@ -912,13 +915,15 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) goto run; /* Attach programs to sockmap */ - err = bpf_prog_attach(prog_fd[0], map_fd[0], - BPF_SK_SKB_STREAM_PARSER, 0); - if (err) { - fprintf(stderr, - "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n", - prog_fd[0], map_fd[0], err, strerror(errno)); - return err; + if (!txmsg_omit_skb_parser) { + err = bpf_prog_attach(prog_fd[0], map_fd[0], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n", + prog_fd[0], map_fd[0], err, strerror(errno)); + return err; + } } err = bpf_prog_attach(prog_fd[1], map_fd[0], @@ -931,13 +936,15 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) /* Attach programs to TLS sockmap */ if (txmsg_ktls_skb) { - err = bpf_prog_attach(prog_fd[0], map_fd[8], - BPF_SK_SKB_STREAM_PARSER, 0); - if (err) { - fprintf(stderr, - "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", - prog_fd[0], map_fd[8], err, strerror(errno)); - return err; + if (!txmsg_omit_skb_parser) { + err = bpf_prog_attach(prog_fd[0], map_fd[8], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", + prog_fd[0], map_fd[8], err, strerror(errno)); + return err; + } } err = bpf_prog_attach(prog_fd[2], map_fd[8], From a24fb420a5775581050026eba2264c87927debf7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:10:26 -0700 Subject: [PATCH 62/62] bpf, selftests: Add three new sockmap tests for verdict only programs Here we add three new tests for sockmap to test having a verdict program without setting the parser program. The first test covers the most simply case, sender proxy_recv proxy_send recv | | | | verdict -----+ | | | | | +----------------+ +------------+ We load the verdict program on the proxy_recv socket without a parser program. It then does a redirect into the send path of the proxy_send socket using sendpage_locked(). 
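From the user side, exercising a verdict-only socket simply means attaching the verdict program and skipping the parser attach; a hypothetical helper (names assumed, using the existing bpf_prog_attach() API) would look like:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <bpf/bpf.h>

/* Attach only the verdict program to a sockmap; no
 * BPF_SK_SKB_STREAM_PARSER program now that the parser is optional.
 */
static int attach_verdict_only(int verdict_prog_fd, int sockmap_fd)
{
        int err = bpf_prog_attach(verdict_prog_fd, sockmap_fd,
                                  BPF_SK_SKB_STREAM_VERDICT, 0);
        if (err)
                fprintf(stderr, "verdict attach failed: %s\n", strerror(errno));
        return err;
}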
Next we test the drop case to ensure if we kfree_skb as a result of the verdict program everything behaves as expected. Next we test the same configuration above, but with ktls and a redirect into socket ingress queue. Shown here tls tls sender proxy_recv proxy_send recv | | | | verdict ------------------+ | | redirect_ingress +----------------+ Also to set up ping/pong test Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239302638.8495.17125996694402793471.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 419c0b010d14..0fa1e421c3d7 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1472,12 +1472,29 @@ static void test_txmsg_skb(int cgrp, struct sockmap_options *opt) txmsg_ktls_skb_drop = 0; txmsg_ktls_skb_redir = 1; test_exec(cgrp, opt); + txmsg_ktls_skb_redir = 0; + + /* Tests that omit skb_parser */ + txmsg_omit_skb_parser = 1; + ktls = 0; + txmsg_ktls_skb = 0; + test_exec(cgrp, opt); + + txmsg_ktls_skb_drop = 1; + test_exec(cgrp, opt); + txmsg_ktls_skb_drop = 0; + + txmsg_ktls_skb_redir = 1; + test_exec(cgrp, opt); + + ktls = 1; + test_exec(cgrp, opt); + txmsg_omit_skb_parser = 0; opt->data_test = data; ktls = k; } - /* Test cork with hung data. This tests poor usage patterns where * cork can leave data on the ring if user program is buggy and * doesn't flush them somehow. They do take some time however