From 345e004d023343d38088fdfea39688aa11e06ccf Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul@isovalent.com>
Date: Fri, 10 Dec 2021 00:46:31 +0100
Subject: [PATCH 01/15] bpf: Fix incorrect state pruning for <8B spill/fill

Commit 354e8f1970f8 ("bpf: Support <8-byte scalar spill and refill")
introduced support in the verifier to track <8B spill/fills of scalars.
The backtracking logic for the precision bit was however skipping
spill/fills of less than 8B. That could cause state pruning to consider
two states equivalent when they shouldn't be.

As an example, consider the following bytecode snippet:

  0:  r7 = r1
  1:  call bpf_get_prandom_u32
  2:  r6 = 2
  3:  if r0 == 0 goto pc+1
  4:  r6 = 3
  ...
  8: [state pruning point]
  ...
  /* u32 spill/fill */
  10: *(u32 *)(r10 - 8) = r6
  11: r8 = *(u32 *)(r10 - 8)
  12: r0 = 0
  13: if r8 == 3 goto pc+1
  14: r0 = 1
  15: exit

The verifier first walks the path with R6=3. Given the support for <8B
spill/fills, at instruction 13, it knows the condition is true and skips
instruction 14. At that point, the backtracking logic kicks in but stops
at the fill instruction since it only propagates the precision bit for
8B spill/fill. When the verifier then walks the path with R6=2, it will
consider it safe at instruction 8 because R6 is not marked as needing
precision. Instruction 14 is thus never walked and is then incorrectly
removed as 'dead code'.

It's also possible to lead the verifier to accept e.g. an out-of-bound
memory access instead of causing an incorrect dead code elimination.

This regression was found via Cilium's bpf-next CI where it was causing
a conntrack map update to be silently skipped because the code had been
removed by the verifier.

This commit fixes it by enabling support for <8B spill/fills in the
bactracking logic. In case of a <8B spill/fill, the full 8B stack slot
will be marked as needing precision. Then, in __mark_chain_precision,
any tracked register spilled in a marked slot will itself be marked as
needing precision, regardless of the spill size. This logic makes two
assumptions: (1) only 8B-aligned spill/fill are tracked and (2) spilled
registers are only tracked if the spill and fill sizes are equal. Commit
ef979017b837 ("bpf: selftest: Add verifier tests for <8-byte scalar
spill and refill") covers the first assumption and the next commit in
this patchset covers the second.

Fixes: 354e8f1970f8 ("bpf: Support <8-byte scalar spill and refill")
Signed-off-by: Paul Chaignon <paul@isovalent.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f3001937bbb9..f2f1ed34cfe9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2379,8 +2379,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 		 */
 		if (insn->src_reg != BPF_REG_FP)
 			return 0;
-		if (BPF_SIZE(insn->code) != BPF_DW)
-			return 0;
 
 		/* dreg = *(u64 *)[fp - off] was a fill from the stack.
 		 * that [fp - off] slot contains scalar that needs to be
@@ -2403,8 +2401,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 		/* scalars can only be spilled into stack */
 		if (insn->dst_reg != BPF_REG_FP)
 			return 0;
-		if (BPF_SIZE(insn->code) != BPF_DW)
-			return 0;
 		spi = (-insn->off - 1) / BPF_REG_SIZE;
 		if (spi >= 64) {
 			verbose(env, "BUG spi %d\n", spi);

From 0be2516f865f5a876837184a8385163ff64a5889 Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul@isovalent.com>
Date: Fri, 10 Dec 2021 00:47:00 +0100
Subject: [PATCH 02/15] selftests/bpf: Tests for state pruning with u32
 spill/fill

This patch adds tests for the verifier's tracking for spilled, <8B
registers. The first two test cases ensure the verifier doesn't
incorrectly prune states in case of <8B spill/fills. The last one simply
checks that a filled u64 register is marked unknown if the register
spilled in the same slack slot was less than 8B.

The map value access at the end of the first program is only incorrect
for the path R6=32. If the precision bit for register R8 isn't
backtracked through the u32 spill/fill, the R6=32 path is pruned at
instruction 9 and the program is incorrectly accepted. The second
program is a variation of the same with u32 spills and a u64 fill.

The additional instructions to introduce the first pruning point may be
a bit fragile as they depend on the heuristics for pruning points in the
verifier (currently at least 8 instructions and 2 jumps). If the
heuristics are changed, the pruning point may move (e.g., to the
subsequent jump) or disappear, which would cause the test to always pass.

Signed-off-by: Paul Chaignon <paul@isovalent.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/verifier/search_pruning.c   | 71 +++++++++++++++++++
 .../selftests/bpf/verifier/spill_fill.c       | 32 +++++++++
 2 files changed, 103 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/search_pruning.c b/tools/testing/selftests/bpf/verifier/search_pruning.c
index 7e50cb80873a..682519769fe3 100644
--- a/tools/testing/selftests/bpf/verifier/search_pruning.c
+++ b/tools/testing/selftests/bpf/verifier/search_pruning.c
@@ -132,6 +132,77 @@
 	.result = REJECT,
 	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
 },
+{
+	"precision tracking for u32 spill/fill",
+	.insns = {
+		BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
+		BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+		BPF_MOV32_IMM(BPF_REG_6, 32),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_MOV32_IMM(BPF_REG_6, 4),
+		/* Additional insns to introduce a pruning point. */
+		BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+		BPF_MOV64_IMM(BPF_REG_3, 0),
+		BPF_MOV64_IMM(BPF_REG_3, 0),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 0),
+		/* u32 spill/fill */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -8),
+		BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_10, -8),
+		/* out-of-bound map value access for r6=32 */
+		BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+		BPF_LD_MAP_FD(BPF_REG_1, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+		BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_8),
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_8b = { 15 },
+	.result = REJECT,
+	.errstr = "R0 min value is outside of the allowed memory range",
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"precision tracking for u32 spills, u64 fill",
+	.insns = {
+		BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+		BPF_MOV32_IMM(BPF_REG_7, 0xffffffff),
+		/* Additional insns to introduce a pruning point. */
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		BPF_ALU32_IMM(BPF_DIV, BPF_REG_3, 0),
+		/* u32 spills, u64 fill */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -4),
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, -8),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, -8),
+		/* if r8 != X goto pc+1  r8 known in fallthrough branch */
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_8, 0xffffffff, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		/* if r8 == X goto pc+1  condition always true on first
+		 * traversal, so starts backtracking to mark r8 as requiring
+		 * precision. r7 marked as needing precision. r6 not marked
+		 * since it's not tracked.
+		 */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 0xffffffff, 1),
+		/* fails if r8 correctly marked unknown after fill. */
+		BPF_ALU32_IMM(BPF_DIV, BPF_REG_3, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "div by zero",
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
 {
 	"allocated_stack",
 	.insns = {
diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c
index 7ab3de108761..6c907144311f 100644
--- a/tools/testing/selftests/bpf/verifier/spill_fill.c
+++ b/tools/testing/selftests/bpf/verifier/spill_fill.c
@@ -175,6 +175,38 @@
 	.errstr = "invalid access to packet",
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 },
+{
+	"Spill u32 const scalars.  Refill as u64.  Offset to skb->data",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct __sk_buff, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct __sk_buff, data_end)),
+	/* r6 = 0 */
+	BPF_MOV32_IMM(BPF_REG_6, 0),
+	/* r7 = 20 */
+	BPF_MOV32_IMM(BPF_REG_7, 20),
+	/* *(u32 *)(r10 -4) = r6 */
+	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -4),
+	/* *(u32 *)(r10 -8) = r7 */
+	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, -8),
+	/* r4 = *(u64 *)(r10 -8) */
+	BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_10, -8),
+	/* r0 = r2 */
+	BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+	/* r0 += r4 R0=pkt R2=pkt R3=pkt_end R4=inv,umax=65535 */
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_4),
+	/* if (r0 > r3) R0=pkt,umax=65535 R2=pkt R3=pkt_end R4=inv,umax=65535 */
+	BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+	/* r0 = *(u32 *)r2 R0=pkt,umax=65535 R2=pkt R3=pkt_end R4=inv20 */
+	BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "invalid access to packet",
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
 {
 	"Spill a u32 const scalar.  Refill as u16 from fp-6.  Offset to skb->data",
 	.insns = {

From bd0687c18e635b63233dc87f38058cd728802ab4 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Tue, 14 Dec 2021 11:26:07 +0100
Subject: [PATCH 03/15] xsk: Do not sleep in poll() when need_wakeup set

Do not sleep in poll() when the need_wakeup flag is set. When this
flag is set, the application needs to explicitly wake up the driver
with a syscall (poll, recvmsg, sendmsg, etc.) to guarantee that Rx
and/or Tx processing will be processed promptly. But the current code
in poll(), sleeps first then wakes up the driver. This means that no
driver processing will occur (baring any interrupts) until the timeout
has expired.

Fix this by checking the need_wakeup flag first and if set, wake the
driver and return to the application. Only if need_wakeup is not set
should the process sleep if there is a timeout set in the poll() call.

Fixes: 77cd0d7b3f25 ("xsk: add support for need_wakeup flag in AF_XDP rings")
Reported-by: Keith Wiles <keith.wiles@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/bpf/20211214102607.7677-1-magnus.karlsson@gmail.com
---
 net/xdp/xsk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f16074eb53c7..7a466ea962c5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -677,8 +677,6 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,
 	struct xdp_sock *xs = xdp_sk(sk);
 	struct xsk_buff_pool *pool;
 
-	sock_poll_wait(file, sock, wait);
-
 	if (unlikely(!xsk_is_bound(xs)))
 		return mask;
 
@@ -690,6 +688,8 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,
 		else
 			/* Poll needs to drive Tx also in copy mode */
 			__xsk_sendmsg(sk);
+	} else {
+		sock_poll_wait(file, sock, wait);
 	}
 
 	if (xs->rx && !xskq_prod_is_empty(xs->rx))

From f7abc4c8df8c7930d0b9c56d9abee9a1fca635e9 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Tue, 14 Dec 2021 07:18:00 +0530
Subject: [PATCH 04/15] selftests/bpf: Fix OOB write in test_verifier

The commit referenced below added fixup_map_timer support (to create a
BPF map containing timers), but failed to increase the size of the
map_fds array, leading to out of bounds write. Fix this by changing
MAX_NR_MAPS to 22.

Fixes: e60e6962c503 ("selftests/bpf: Add tests for restricted helpers")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211214014800.78762-1-memxor@gmail.com
---
 tools/testing/selftests/bpf/test_verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 465ef3f112c0..d3bf83d5c6cf 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -54,7 +54,7 @@
 #define MAX_INSNS	BPF_MAXINSNS
 #define MAX_TEST_INSNS	1000000
 #define MAX_FIXUPS	8
-#define MAX_NR_MAPS	21
+#define MAX_NR_MAPS	22
 #define MAX_TEST_RUNS	8
 #define POINTER_VALUE	0xcafe4all
 #define TEST_DATA_LEN	64

From 7d3baf0afa3aa9102d6a521a8e4c41888bb79882 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 7 Dec 2021 12:51:56 +0000
Subject: [PATCH 05/15] bpf: Fix kernel address leakage in atomic fetch

The change in commit 37086bfdc737 ("bpf: Propagate stack bounds to registers
in atomics w/ BPF_FETCH") around check_mem_access() handling is buggy since
this would allow for unprivileged users to leak kernel pointers. For example,
an atomic fetch/and with -1 on a stack destination which holds a spilled
pointer will migrate the spilled register type into a scalar, which can then
be exported out of the program (since scalar != pointer) by dumping it into
a map value.

The original implementation of XADD was preventing this situation by using
a double call to check_mem_access() one with BPF_READ and a subsequent one
with BPF_WRITE, in both cases passing -1 as a placeholder value instead of
register as per XADD semantics since it didn't contain a value fetch. The
BPF_READ also included a check in check_stack_read_fixed_off() which rejects
the program if the stack slot is of __is_pointer_value() if dst_regno < 0.
The latter is to distinguish whether we're dealing with a regular stack spill/
fill or some arithmetical operation which is disallowed on non-scalars, see
also 6e7e63cbb023 ("bpf: Forbid XADD on spilled pointers for unprivileged
users") for more context on check_mem_access() and its handling of placeholder
value -1.

One minimally intrusive option to fix the leak is for the BPF_FETCH case to
initially check the BPF_READ case via check_mem_access() with -1 as register,
followed by the actual load case with non-negative load_reg to propagate
stack bounds to registers.

Fixes: 37086bfdc737 ("bpf: Propagate stack bounds to registers in atomics w/ BPF_FETCH")
Reported-by: <n4ke4mry@gmail.com>
Acked-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f2f1ed34cfe9..53d39db3b0fa 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4584,13 +4584,19 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 		load_reg = -1;
 	}
 
-	/* check whether we can read the memory */
+	/* Check whether we can read the memory, with second call for fetch
+	 * case to simulate the register fill.
+	 */
 	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
-			       BPF_SIZE(insn->code), BPF_READ, load_reg, true);
+			       BPF_SIZE(insn->code), BPF_READ, -1, true);
+	if (!err && load_reg >= 0)
+		err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+				       BPF_SIZE(insn->code), BPF_READ, load_reg,
+				       true);
 	if (err)
 		return err;
 
-	/* check whether we can write into the same memory */
+	/* Check whether we can write into the same memory. */
 	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 			       BPF_SIZE(insn->code), BPF_WRITE, -1, true);
 	if (err)

From 180486b430f4e22cc00a478163d942804baae4b5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 7 Dec 2021 10:07:04 +0000
Subject: [PATCH 06/15] bpf, selftests: Add test case for atomic fetch on
 spilled pointer

Test whether unprivileged would be able to leak the spilled pointer either
by exporting the returned value from the atomic{32,64} operation or by reading
and exporting the value from the stack after the atomic operation took place.

Note that for unprivileged, the below atomic cmpxchg test case named "Dest
pointer in r0 - succeed" is failing. The reason is that in the dst memory
location (r10 -8) there is the spilled register r10:

  0: R1=ctx(id=0,off=0,imm=0) R10=fp0
  0: (bf) r0 = r10
  1: R0_w=fp0 R1=ctx(id=0,off=0,imm=0) R10=fp0
  1: (7b) *(u64 *)(r10 -8) = r0
  2: R0_w=fp0 R1=ctx(id=0,off=0,imm=0) R10=fp0 fp-8_w=fp
  2: (b7) r1 = 0
  3: R0_w=fp0 R1_w=invP0 R10=fp0 fp-8_w=fp
  3: (db) r0 = atomic64_cmpxchg((u64 *)(r10 -8), r0, r1)
  4: R0_w=fp0 R1_w=invP0 R10=fp0 fp-8_w=mmmmmmmm
  4: (79) r1 = *(u64 *)(r0 -8)
  5: R0_w=fp0 R1_w=invP(id=0) R10=fp0 fp-8_w=mmmmmmmm
  5: (b7) r0 = 0
  6: R0_w=invP0 R1_w=invP(id=0) R10=fp0 fp-8_w=mmmmmmmm
  6: (95) exit

However, allowing this case for unprivileged is a bit useless given an
update with a new pointer will fail anyway:

  0: R1=ctx(id=0,off=0,imm=0) R10=fp0
  0: (bf) r0 = r10
  1: R0_w=fp0 R1=ctx(id=0,off=0,imm=0) R10=fp0
  1: (7b) *(u64 *)(r10 -8) = r0
  2: R0_w=fp0 R1=ctx(id=0,off=0,imm=0) R10=fp0 fp-8_w=fp
  2: (db) r0 = atomic64_cmpxchg((u64 *)(r10 -8), r0, r10)
  R10 leaks addr into mem

Acked-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/verifier/atomic_cmpxchg.c   | 23 +++++
 .../selftests/bpf/verifier/atomic_fetch.c     | 94 +++++++++++++++++++
 2 files changed, 117 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
index c22dc83a41fd..0ffc69f602af 100644
--- a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
+++ b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
@@ -156,4 +156,27 @@
 		BPF_EXIT_INSN(),
 	},
 	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "leaking pointer from stack off -8",
+},
+{
+	"Dest pointer in r0 - succeed, check 2",
+	.insns = {
+		/* r0 = &val */
+		BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),
+		/* val = r0; */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+		/* r5 = &val */
+		BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+		/* r0 = atomic_cmpxchg(&val, r0, r5); */
+		BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_5, -8),
+		/* r1 = *r0 */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8),
+		/* exit(0); */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "R5 leaks addr into mem",
 },
diff --git a/tools/testing/selftests/bpf/verifier/atomic_fetch.c b/tools/testing/selftests/bpf/verifier/atomic_fetch.c
index 3bc9ff7a860b..5bf03fb4fa2b 100644
--- a/tools/testing/selftests/bpf/verifier/atomic_fetch.c
+++ b/tools/testing/selftests/bpf/verifier/atomic_fetch.c
@@ -1,3 +1,97 @@
+{
+	"atomic dw/fetch and address leakage of (map ptr & -1) via stack slot",
+	.insns = {
+		BPF_LD_IMM64(BPF_REG_1, -1),
+		BPF_LD_MAP_FD(BPF_REG_8, 0),
+		BPF_LD_MAP_FD(BPF_REG_9, 0),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
+		BPF_ATOMIC_OP(BPF_DW, BPF_AND | BPF_FETCH, BPF_REG_2, BPF_REG_1, 0),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_2, 0),
+		BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 2, 4 },
+	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "leaking pointer from stack off -8",
+},
+{
+	"atomic dw/fetch and address leakage of (map ptr & -1) via returned value",
+	.insns = {
+		BPF_LD_IMM64(BPF_REG_1, -1),
+		BPF_LD_MAP_FD(BPF_REG_8, 0),
+		BPF_LD_MAP_FD(BPF_REG_9, 0),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
+		BPF_ATOMIC_OP(BPF_DW, BPF_AND | BPF_FETCH, BPF_REG_2, BPF_REG_1, 0),
+		BPF_MOV64_REG(BPF_REG_9, BPF_REG_1),
+		BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 2, 4 },
+	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "leaking pointer from stack off -8",
+},
+{
+	"atomic w/fetch and address leakage of (map ptr & -1) via stack slot",
+	.insns = {
+		BPF_LD_IMM64(BPF_REG_1, -1),
+		BPF_LD_MAP_FD(BPF_REG_8, 0),
+		BPF_LD_MAP_FD(BPF_REG_9, 0),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
+		BPF_ATOMIC_OP(BPF_W, BPF_AND | BPF_FETCH, BPF_REG_2, BPF_REG_1, 0),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_2, 0),
+		BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 2, 4 },
+	.result = REJECT,
+	.errstr = "invalid size of register fill",
+},
+{
+	"atomic w/fetch and address leakage of (map ptr & -1) via returned value",
+	.insns = {
+		BPF_LD_IMM64(BPF_REG_1, -1),
+		BPF_LD_MAP_FD(BPF_REG_8, 0),
+		BPF_LD_MAP_FD(BPF_REG_9, 0),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
+		BPF_ATOMIC_OP(BPF_W, BPF_AND | BPF_FETCH, BPF_REG_2, BPF_REG_1, 0),
+		BPF_MOV64_REG(BPF_REG_9, BPF_REG_1),
+		BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 2, 4 },
+	.result = REJECT,
+	.errstr = "invalid size of register fill",
+},
 #define __ATOMIC_FETCH_OP_TEST(src_reg, dst_reg, operand1, op, operand2, expect) \
 	{								\
 		"atomic fetch " #op ", src=" #dst_reg " dst=" #dst_reg,	\

From a82fe085f344ef20b452cd5f481010ff96b5c4cd Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 7 Dec 2021 11:02:02 +0000
Subject: [PATCH 07/15] bpf: Fix kernel address leakage in atomic cmpxchg's r0
 aux reg

The implementation of BPF_CMPXCHG on a high level has the following parameters:

  .-[old-val]                                          .-[new-val]
  BPF_R0 = cmpxchg{32,64}(DST_REG + insn->off, BPF_R0, SRC_REG)
                          `-[mem-loc]          `-[old-val]

Given a BPF insn can only have two registers (dst, src), the R0 is fixed and
used as an auxilliary register for input (old value) as well as output (returning
old value from memory location). While the verifier performs a number of safety
checks, it misses to reject unprivileged programs where R0 contains a pointer as
old value.

Through brute-forcing it takes about ~16sec on my machine to leak a kernel pointer
with BPF_CMPXCHG. The PoC is basically probing for kernel addresses by storing the
guessed address into the map slot as a scalar, and using the map value pointer as
R0 while SRC_REG has a canary value to detect a matching address.

Fix it by checking R0 for pointers, and reject if that's the case for unprivileged
programs.

Fixes: 5ffa25502b5a ("bpf: Add instructions for atomic_[cmp]xchg")
Reported-by: Ryota Shiga (Flatt Security)
Acked-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 53d39db3b0fa..2d48159b58bd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4547,9 +4547,16 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 
 	if (insn->imm == BPF_CMPXCHG) {
 		/* Check comparison of R0 with memory location */
-		err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+		const u32 aux_reg = BPF_REG_0;
+
+		err = check_reg_arg(env, aux_reg, SRC_OP);
 		if (err)
 			return err;
+
+		if (is_pointer_value(env, aux_reg)) {
+			verbose(env, "R%d leaks addr into mem\n", aux_reg);
+			return -EACCES;
+		}
 	}
 
 	if (is_pointer_value(env, insn->src_reg)) {

From e523102cb719cbad1673b6aa2a4d5c1fa6f13799 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 13 Dec 2021 22:25:23 +0000
Subject: [PATCH 08/15] bpf, selftests: Update test case for atomic cmpxchg on
 r0 with pointer

Fix up unprivileged test case results for 'Dest pointer in r0' verifier tests
given they now need to reject R0 containing a pointer value, and add a couple
of new related ones with 32bit cmpxchg as well.

  root@foo:~/bpf/tools/testing/selftests/bpf# ./test_verifier
  #0/u invalid and of negative number OK
  #0/p invalid and of negative number OK
  [...]
  #1268/p XDP pkt read, pkt_meta' <= pkt_data, bad access 1 OK
  #1269/p XDP pkt read, pkt_meta' <= pkt_data, bad access 2 OK
  #1270/p XDP pkt read, pkt_data <= pkt_meta', good access OK
  #1271/p XDP pkt read, pkt_data <= pkt_meta', bad access 1 OK
  #1272/p XDP pkt read, pkt_data <= pkt_meta', bad access 2 OK
  Summary: 1900 PASSED, 0 SKIPPED, 0 FAILED

Acked-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/verifier/atomic_cmpxchg.c   | 67 ++++++++++++++++++-
 1 file changed, 65 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
index 0ffc69f602af..b39665f33524 100644
--- a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
+++ b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
@@ -138,6 +138,8 @@
 		BPF_EXIT_INSN(),
 	},
 	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "R0 leaks addr into mem",
 },
 {
 	"Dest pointer in r0 - succeed",
@@ -157,7 +159,7 @@
 	},
 	.result = ACCEPT,
 	.result_unpriv = REJECT,
-	.errstr_unpriv = "leaking pointer from stack off -8",
+	.errstr_unpriv = "R0 leaks addr into mem",
 },
 {
 	"Dest pointer in r0 - succeed, check 2",
@@ -178,5 +180,66 @@
 	},
 	.result = ACCEPT,
 	.result_unpriv = REJECT,
-	.errstr_unpriv = "R5 leaks addr into mem",
+	.errstr_unpriv = "R0 leaks addr into mem",
+},
+{
+	"Dest pointer in r0 - succeed, check 3",
+	.insns = {
+		/* r0 = &val */
+		BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),
+		/* val = r0; */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+		/* r5 = &val */
+		BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+		/* r0 = atomic_cmpxchg(&val, r0, r5); */
+		BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_5, -8),
+		/* exit(0); */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "invalid size of register fill",
+	.errstr_unpriv = "R0 leaks addr into mem",
+},
+{
+	"Dest pointer in r0 - succeed, check 4",
+	.insns = {
+		/* r0 = &val */
+		BPF_MOV32_REG(BPF_REG_0, BPF_REG_10),
+		/* val = r0; */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -8),
+		/* r5 = &val */
+		BPF_MOV32_REG(BPF_REG_5, BPF_REG_10),
+		/* r0 = atomic_cmpxchg(&val, r0, r5); */
+		BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_5, -8),
+		/* r1 = *r10 */
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -8),
+		/* exit(0); */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "R10 partial copy of pointer",
+},
+{
+	"Dest pointer in r0 - succeed, check 5",
+	.insns = {
+		/* r0 = &val */
+		BPF_MOV32_REG(BPF_REG_0, BPF_REG_10),
+		/* val = r0; */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -8),
+		/* r5 = &val */
+		BPF_MOV32_REG(BPF_REG_5, BPF_REG_10),
+		/* r0 = atomic_cmpxchg(&val, r0, r5); */
+		BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_5, -8),
+		/* r1 = *r0 */
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, -8),
+		/* exit(0); */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "R0 invalid mem access",
+	.errstr_unpriv = "R10 partial copy of pointer",
 },

From 3cf2b61eb06765e27fec6799292d9fb46d0b7e60 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Dec 2021 22:02:19 +0000
Subject: [PATCH 09/15] bpf: Fix signed bounds propagation after mov32

For the case where both s32_{min,max}_value bounds are positive, the
__reg_assign_32_into_64() directly propagates them to their 64 bit
counterparts, otherwise it pessimises them into [0,u32_max] universe and
tries to refine them later on by learning through the tnum as per comment
in mentioned function. However, that does not always happen, for example,
in mov32 operation we call zext_32_to_64(dst_reg) which invokes the
__reg_assign_32_into_64() as is without subsequent bounds update as
elsewhere thus no refinement based on tnum takes place.

Thus, not calling into the __update_reg_bounds() / __reg_deduce_bounds() /
__reg_bound_offset() triplet as we do, for example, in case of ALU ops via
adjust_scalar_min_max_vals(), will lead to more pessimistic bounds when
dumping the full register state:

Before fix:

  0: (b4) w0 = -1
  1: R0_w=invP4294967295
     (id=0,imm=ffffffff,
      smin_value=4294967295,smax_value=4294967295,
      umin_value=4294967295,umax_value=4294967295,
      var_off=(0xffffffff; 0x0),
      s32_min_value=-1,s32_max_value=-1,
      u32_min_value=-1,u32_max_value=-1)

  1: (bc) w0 = w0
  2: R0_w=invP4294967295
     (id=0,imm=ffffffff,
      smin_value=0,smax_value=4294967295,
      umin_value=4294967295,umax_value=4294967295,
      var_off=(0xffffffff; 0x0),
      s32_min_value=-1,s32_max_value=-1,
      u32_min_value=-1,u32_max_value=-1)

Technically, the smin_value=0 and smax_value=4294967295 bounds are not
incorrect, but given the register is still a constant, they break assumptions
about const scalars that smin_value == smax_value and umin_value == umax_value.

After fix:

  0: (b4) w0 = -1
  1: R0_w=invP4294967295
     (id=0,imm=ffffffff,
      smin_value=4294967295,smax_value=4294967295,
      umin_value=4294967295,umax_value=4294967295,
      var_off=(0xffffffff; 0x0),
      s32_min_value=-1,s32_max_value=-1,
      u32_min_value=-1,u32_max_value=-1)

  1: (bc) w0 = w0
  2: R0_w=invP4294967295
     (id=0,imm=ffffffff,
      smin_value=4294967295,smax_value=4294967295,
      umin_value=4294967295,umax_value=4294967295,
      var_off=(0xffffffff; 0x0),
      s32_min_value=-1,s32_max_value=-1,
      u32_min_value=-1,u32_max_value=-1)

Without the smin_value == smax_value and umin_value == umax_value invariant
being intact for const scalars, it is possible to leak out kernel pointers
from unprivileged user space if the latter is enabled. For example, when such
registers are involved in pointer arithmtics, then adjust_ptr_min_max_vals()
will taint the destination register into an unknown scalar, and the latter
can be exported and stored e.g. into a BPF map value.

Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking")
Reported-by: Kuee K1r0a <liulin063@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2d48159b58bd..0872b6c9fb33 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8317,6 +8317,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 							 insn->dst_reg);
 				}
 				zext_32_to_64(dst_reg);
+
+				__update_reg_bounds(dst_reg);
+				__reg_deduce_bounds(dst_reg);
+				__reg_bound_offset(dst_reg);
 			}
 		} else {
 			/* case: R = imm

From e572ff80f05c33cd0cb4860f864f5c9c044280b6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Dec 2021 22:28:48 +0000
Subject: [PATCH 10/15] bpf: Make 32->64 bounds propagation slightly more
 robust

Make the bounds propagation in __reg_assign_32_into_64() slightly more
robust and readable by aligning it similarly as we did back in the
__reg_combine_64_into_32() counterpart. Meaning, only propagate or
pessimize them as a smin/smax pair.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0872b6c9fb33..b532f1058d35 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1366,22 +1366,28 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
 	reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
 }
 
+static bool __reg32_bound_s64(s32 a)
+{
+	return a >= 0 && a <= S32_MAX;
+}
+
 static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
 {
 	reg->umin_value = reg->u32_min_value;
 	reg->umax_value = reg->u32_max_value;
-	/* Attempt to pull 32-bit signed bounds into 64-bit bounds
-	 * but must be positive otherwise set to worse case bounds
-	 * and refine later from tnum.
+
+	/* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
+	 * be positive otherwise set to worse case bounds and refine later
+	 * from tnum.
 	 */
-	if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0)
-		reg->smax_value = reg->s32_max_value;
-	else
-		reg->smax_value = U32_MAX;
-	if (reg->s32_min_value >= 0)
+	if (__reg32_bound_s64(reg->s32_min_value) &&
+	    __reg32_bound_s64(reg->s32_max_value)) {
 		reg->smin_value = reg->s32_min_value;
-	else
+		reg->smax_value = reg->s32_max_value;
+	} else {
 		reg->smin_value = 0;
+		reg->smax_value = U32_MAX;
+	}
 }
 
 static void __reg_combine_32_into_64(struct bpf_reg_state *reg)

From b1a7288dedc6caf9023f2676b4f5ed34cf0d4029 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Dec 2021 23:48:54 +0000
Subject: [PATCH 11/15] bpf, selftests: Add test case trying to taint map value
 pointer

Add a test case which tries to taint map value pointer arithmetic into a
unknown scalar with subsequent export through the map.

Before fix:

  # ./test_verifier 1186
  #1186/u map access: trying to leak tained dst reg FAIL
  Unexpected success to load!
  verification time 24 usec
  stack depth 8
  processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1
  #1186/p map access: trying to leak tained dst reg FAIL
  Unexpected success to load!
  verification time 8 usec
  stack depth 8
  processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1
  Summary: 0 PASSED, 0 SKIPPED, 2 FAILED

After fix:

  # ./test_verifier 1186
  #1186/u map access: trying to leak tained dst reg OK
  #1186/p map access: trying to leak tained dst reg OK
  Summary: 2 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/verifier/value_ptr_arith.c  | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
index 2debba4e8a3a..4d347bc53aa2 100644
--- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
+++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
@@ -1077,6 +1077,29 @@
 	.errstr = "R0 invalid mem access 'inv'",
 	.errstr_unpriv = "R0 pointer -= pointer prohibited",
 },
+{
+	"map access: trying to leak tained dst reg",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_MOV32_IMM(BPF_REG_1, 0xFFFFFFFF),
+	BPF_MOV32_REG(BPF_REG_1, BPF_REG_1),
+	BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+	BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 4 },
+	.result = REJECT,
+	.errstr = "math between map_value pointer and 4294967295 is not allowed",
+},
 {
 	"32bit pkt_ptr -= scalar",
 	.insns = {

From 433956e91200734d09958673a56df02d00a917c2 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 15 Dec 2021 18:38:30 -0800
Subject: [PATCH 12/15] bpf: Fix extable fixup offset.

The prog - start_of_ldx is the offset before the faulting ldx to the location
after it, so this will be used to adjust pt_regs->ip for jumping over it and
continuing, and with old temp it would have been fixed up to the wrong offset,
causing crash.

Fixes: 4c5de127598e ("bpf: Emit explicit NULL pointer checks for PROBE_LDX instructions.")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/x86/net/bpf_jit_comp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 726700fabca6..fa58681db45e 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1305,7 +1305,7 @@ st:			if (is_imm8(insn->off))
 				 * End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
 				 * of 4 bytes will be ignored and rbx will be zero inited.
 				 */
-				ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8);
+				ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
 			}
 			break;
 

From 588a25e92458c6efeb7a261d5ca5726f5de89184 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 14 Dec 2021 19:25:13 -0800
Subject: [PATCH 13/15] bpf: Fix extable address check.

The verifier checks that PTR_TO_BTF_ID pointer is either valid or NULL,
but it cannot distinguish IS_ERR pointer from valid one.

When offset is added to IS_ERR pointer it may become small positive
value which is a user address that is not handled by extable logic
and has to be checked for at the runtime.

Tighten BPF_PROBE_MEM pointer check code to prevent this case.

Fixes: 4c5de127598e ("bpf: Emit explicit NULL pointer checks for PROBE_LDX instructions.")
Reported-by: Lorenzo Fontana <lorenzo.fontana@elastic.co>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/x86/net/bpf_jit_comp.c | 49 +++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index fa58681db45e..bafe36e69227 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1252,19 +1252,54 @@ st:			if (is_imm8(insn->off))
 		case BPF_LDX | BPF_MEM | BPF_DW:
 		case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
 			if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
-				/* test src_reg, src_reg */
-				maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
-				EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
-				/* jne start_of_ldx */
-				EMIT2(X86_JNE, 0);
+				/* Though the verifier prevents negative insn->off in BPF_PROBE_MEM
+				 * add abs(insn->off) to the limit to make sure that negative
+				 * offset won't be an issue.
+				 * insn->off is s16, so it won't affect valid pointers.
+				 */
+				u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off);
+				u8 *end_of_jmp1, *end_of_jmp2;
+
+				/* Conservatively check that src_reg + insn->off is a kernel address:
+				 * 1. src_reg + insn->off >= limit
+				 * 2. src_reg + insn->off doesn't become small positive.
+				 * Cannot do src_reg + insn->off >= limit in one branch,
+				 * since it needs two spare registers, but JIT has only one.
+				 */
+
+				/* movabsq r11, limit */
+				EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
+				EMIT((u32)limit, 4);
+				EMIT(limit >> 32, 4);
+				/* cmp src_reg, r11 */
+				maybe_emit_mod(&prog, src_reg, AUX_REG, true);
+				EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
+				/* if unsigned '<' goto end_of_jmp2 */
+				EMIT2(X86_JB, 0);
+				end_of_jmp1 = prog;
+
+				/* mov r11, src_reg */
+				emit_mov_reg(&prog, true, AUX_REG, src_reg);
+				/* add r11, insn->off */
+				maybe_emit_1mod(&prog, AUX_REG, true);
+				EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
+				/* jmp if not carry to start_of_ldx
+				 * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr
+				 * that has to be rejected.
+				 */
+				EMIT2(0x73 /* JNC */, 0);
+				end_of_jmp2 = prog;
+
 				/* xor dst_reg, dst_reg */
 				emit_mov_imm32(&prog, false, dst_reg, 0);
 				/* jmp byte_after_ldx */
 				EMIT2(0xEB, 0);
 
-				/* populate jmp_offset for JNE above */
-				temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
+				/* populate jmp_offset for JB above to jump to xor dst_reg */
+				end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1;
+				/* populate jmp_offset for JNC above to jump to start_of_ldx */
 				start_of_ldx = prog;
+				end_of_jmp2[-1] = start_of_ldx - end_of_jmp2;
 			}
 			emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
 			if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {

From 7edc3fcbf9a2b2e3df53c9656a9f85bf807affac Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 15 Dec 2021 12:35:34 -0800
Subject: [PATCH 14/15] selftest/bpf: Add a test that reads various addresses.

Add a function to bpf_testmod that returns invalid kernel and user addresses.
Then attach an fexit program to that function that tries to read
memory through these addresses.

This logic checks that bpf_probe_read_kernel and BPF_PROBE_MEM logic is sane.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../selftests/bpf/bpf_testmod/bpf_testmod.c   | 20 +++++++++++++++++++
 .../selftests/bpf/progs/test_module_attach.c  | 12 +++++++++++
 2 files changed, 32 insertions(+)

diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 5d52ea2768df..df3b292a8ffe 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -33,6 +33,22 @@ noinline int bpf_testmod_loop_test(int n)
 	return sum;
 }
 
+__weak noinline struct file *bpf_testmod_return_ptr(int arg)
+{
+	static struct file f = {};
+
+	switch (arg) {
+	case 1: return (void *)EINVAL;		/* user addr */
+	case 2: return (void *)0xcafe4a11;	/* user addr */
+	case 3: return (void *)-EINVAL;		/* canonical, but invalid */
+	case 4: return (void *)(1ull << 60);	/* non-canonical and invalid */
+	case 5: return (void *)~(1ull << 30);	/* trigger extable */
+	case 6: return &f;			/* valid addr */
+	case 7: return (void *)((long)&f | 1);	/* kernel tricks */
+	default: return NULL;
+	}
+}
+
 noinline ssize_t
 bpf_testmod_test_read(struct file *file, struct kobject *kobj,
 		      struct bin_attribute *bin_attr,
@@ -43,6 +59,10 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
 		.off = off,
 		.len = len,
 	};
+	int i = 1;
+
+	while (bpf_testmod_return_ptr(i))
+		i++;
 
 	/* This is always true. Use the check to make sure the compiler
 	 * doesn't remove bpf_testmod_loop_test.
diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c
index b36857093f71..50ce16d02da7 100644
--- a/tools/testing/selftests/bpf/progs/test_module_attach.c
+++ b/tools/testing/selftests/bpf/progs/test_module_attach.c
@@ -87,6 +87,18 @@ int BPF_PROG(handle_fexit,
 	return 0;
 }
 
+SEC("fexit/bpf_testmod_return_ptr")
+int BPF_PROG(handle_fexit_ret, int arg, struct file *ret)
+{
+	long buf = 0;
+
+	bpf_probe_read_kernel(&buf, 8, ret);
+	bpf_probe_read_kernel(&buf, 8, (char *)ret + 256);
+	*(volatile long long *)ret;
+	*(volatile int *)&ret->f_mode;
+	return 0;
+}
+
 __u32 fmod_ret_read_sz = 0;
 
 SEC("fmod_ret/bpf_testmod_test_read")

From c2fcbf81c332b42382a0c439bfe2414a241e4f5b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 16 Dec 2021 11:16:30 -0800
Subject: [PATCH 15/15] bpf, selftests: Fix racing issue in btf_skc_cls_ingress
 test

The libbpf CI reported occasional failure in btf_skc_cls_ingress:

  test_syncookie:FAIL:Unexpected syncookie states gen_cookie:80326634 recv_cookie:0
  bpf prog error at line 97

"error at line 97" means the bpf prog cannot find the listening socket
when the final ack is received.  It then skipped processing
the syncookie in the final ack which then led to "recv_cookie:0".

The problem is the userspace program did not do accept() and went
ahead to close(listen_fd) before the kernel (and the bpf prog) had
a chance to process the final ack.

The fix is to add accept() call so that the userspace will wait for
the kernel to finish processing the final ack first before close()-ing
everything.

Fixes: 9a856cae2217 ("bpf: selftest: Add test_btf_skc_cls_ingress")
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211216191630.466151-1-kafai@fb.com
---
 .../bpf/prog_tests/btf_skc_cls_ingress.c         | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
index 762f6a9da8b5..664ffc0364f4 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
@@ -90,7 +90,7 @@ static void print_err_line(void)
 
 static void test_conn(void)
 {
-	int listen_fd = -1, cli_fd = -1, err;
+	int listen_fd = -1, cli_fd = -1, srv_fd = -1, err;
 	socklen_t addrlen = sizeof(srv_sa6);
 	int srv_port;
 
@@ -112,6 +112,10 @@ static void test_conn(void)
 	if (CHECK_FAIL(cli_fd == -1))
 		goto done;
 
+	srv_fd = accept(listen_fd, NULL, NULL);
+	if (CHECK_FAIL(srv_fd == -1))
+		goto done;
+
 	if (CHECK(skel->bss->listen_tp_sport != srv_port ||
 		  skel->bss->req_sk_sport != srv_port,
 		  "Unexpected sk src port",
@@ -134,11 +138,13 @@ done:
 		close(listen_fd);
 	if (cli_fd != -1)
 		close(cli_fd);
+	if (srv_fd != -1)
+		close(srv_fd);
 }
 
 static void test_syncookie(void)
 {
-	int listen_fd = -1, cli_fd = -1, err;
+	int listen_fd = -1, cli_fd = -1, srv_fd = -1, err;
 	socklen_t addrlen = sizeof(srv_sa6);
 	int srv_port;
 
@@ -161,6 +167,10 @@ static void test_syncookie(void)
 	if (CHECK_FAIL(cli_fd == -1))
 		goto done;
 
+	srv_fd = accept(listen_fd, NULL, NULL);
+	if (CHECK_FAIL(srv_fd == -1))
+		goto done;
+
 	if (CHECK(skel->bss->listen_tp_sport != srv_port,
 		  "Unexpected tp src port",
 		  "listen_tp_sport:%u expected:%u\n",
@@ -188,6 +198,8 @@ done:
 		close(listen_fd);
 	if (cli_fd != -1)
 		close(cli_fd);
+	if (srv_fd != -1)
+		close(srv_fd);
 }
 
 struct test {