From 1e660f7ebe0ff6ac65ee0000280392d878630a67 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 6 Sep 2022 19:38:53 -0700 Subject: [PATCH 001/143] bpf: Replace __ksize with ksize. __ksize() was made private. Use ksize() instead. Reported-by: Stephen Rothwell Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 5cc952da7d41..20621f5407d8 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -610,7 +610,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) if (!ptr) return; - idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ)); + idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); if (idx < 0) return; From 720e6a435194fb5237833a4a7ec6aa60a78964a8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 31 Aug 2022 08:26:46 -0700 Subject: [PATCH 002/143] bpf: Allow struct argument in trampoline based programs Allow struct argument in trampoline based programs where the struct size should be <= 16 bytes. In such cases, the argument will be put into up to 2 registers for bpf, x86_64 and arm64 architectures. To support arch-specific trampoline manipulation, add arg_flags for additional struct information about arguments in btf_func_model. Such information will be used in arch specific function arch_prepare_bpf_trampoline() to prepare argument access properly in trampoline. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220831152646.2078089-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ kernel/bpf/btf.c | 45 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c1674973e03..4d32f125f4af 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -727,10 +727,14 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_REG_ARGS 5 +/* The argument is a structure. */ +#define BTF_FMODEL_STRUCT_ARG BIT(0) + struct btf_func_model { u8 ret_size; u8 nr_args; u8 arg_size[MAX_BPF_FUNC_ARGS]; + u8 arg_flags[MAX_BPF_FUNC_ARGS]; }; /* Restore arguments before returning from trampoline to let original function diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 903719b89238..ea94527e5d70 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5328,6 +5328,34 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t) return btf_type_is_int(t); } +static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) +{ + const struct btf_param *args; + const struct btf_type *t; + u32 offset = 0, nr_args; + int i; + + if (!func_proto) + return off / 8; + + nr_args = btf_type_vlen(func_proto); + args = (const struct btf_param *)(func_proto + 1); + for (i = 0; i < nr_args; i++) { + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); + if (off < offset) + return i; + } + + t = btf_type_skip_modifiers(btf, func_proto->type, NULL); + offset += btf_type_is_ptr(t) ? 
8 : roundup(t->size, 8);
+	if (off < offset)
+		return nr_args;
+
+	return nr_args + 1;
+}
+
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
		    const struct bpf_prog *prog,
		    struct bpf_insn_access_aux *info)
@@ -5347,7 +5375,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
			tname, off);
		return false;
	}
-	arg = off / 8;
+	arg = get_ctx_arg_idx(btf, t, off);
	args = (const struct btf_param *)(t + 1);
	/* if (t == NULL) Fall back to default BPF prog with
	 * MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -5417,7 +5445,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
	/* skip modifiers */
	while (btf_type_is_modifier(t))
		t = btf_type_by_id(btf, t->type);
-	if (btf_type_is_small_int(t) || btf_is_any_enum(t))
+	if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
		/* accessing a scalar */
		return true;
	if (!btf_type_is_ptr(t)) {
@@ -5881,7 +5909,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
	if (btf_type_is_ptr(t))
		/* kernel size of pointer. Not BPF's size of pointer*/
		return sizeof(void *);
-	if (btf_type_is_int(t) || btf_is_any_enum(t))
+	if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
		return t->size;
	return -EINVAL;
 }
@@ -5901,8 +5929,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
		/* BTF function prototype doesn't match the verifier types.
		 * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
		 */
-		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
			m->arg_size[i] = 8;
+			m->arg_flags[i] = 0;
+		}
		m->ret_size = 8;
		m->nr_args = MAX_BPF_FUNC_REG_ARGS;
		return 0;
@@ -5916,7 +5946,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
		return -EINVAL;
	}
	ret = __get_type_size(btf, func->type, &t);
-	if (ret < 0) {
+	if (ret < 0 || __btf_type_is_struct(t)) {
		bpf_log(log,
			"The function %s return type %s is unsupported.\n",
			tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
@@ -5932,7 +5962,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
			return -EINVAL;
		}
		ret = __get_type_size(btf, args[i].type, &t);
-		if (ret < 0) {
+
+		/* No support of struct argument size greater than 16 bytes */
+		if (ret < 0 || ret > 16) {
			bpf_log(log,
				"The function %s arg%d type %s is unsupported.\n",
				tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
@@ -5945,6 +5977,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
			return -EINVAL;
		}
		m->arg_size[i] = ret;
+		m->arg_flags[i] = __btf_type_is_struct(t) ? BTF_FMODEL_STRUCT_ARG : 0;
	}
	m->nr_args = nargs;
	return 0;

From a9c5ad31fbdc4dec6d266fe22e51de1ad6d1bcf2 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Wed, 31 Aug 2022 08:26:52 -0700
Subject: [PATCH 003/143] bpf: x86: Support in-register struct arguments in
 trampoline programs

In C, a struct value can be passed as a function argument. For small structs, the struct value may be passed in one or more registers. For trampoline based bpf programs, this causes a complication, since the one-to-one mapping between function arguments and arch argument registers is no longer valid.

The latest llvm16 added support for passing structs of up to 16 bytes by value on the bpf target ([1]). The same is true for the x86_64 architecture, where two registers hold the struct value if the struct size is >8 and <= 16 bytes. This may not hold if one of the struct members has 'double' type, but the current linux source code has no such instance yet, so we assume every struct of size >8 && <= 16 bytes occupies two general purpose argument registers.
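As an illustration (a hedged sketch, not taken from this patch; the struct and function names are made up), a 16-byte struct of two longs is classified INTEGER by the x86_64 SysV calling convention and is split across two argument registers:

```
struct two_longs {
	long a;
	long b;	/* 16 bytes total -> passed by value in two registers */
};

/* s occupies rdi+rsi and c lands in rdx, so three argument registers
 * are consumed for only two C-level parameters. This is why the
 * trampoline can no longer assume a one-to-one arg/register mapping.
 */
int example(struct two_longs s, int c);
```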
Also change on-stack nr_args value to the number of registers holding the arguments. This will permit bpf_get_func_arg() helper to get all argument values. [1] https://reviews.llvm.org/D132144 Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220831152652.2078600-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 68 +++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index c1f6c1c51d99..ae89f4143eb4 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1751,34 +1751,60 @@ emit_jmp: static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args, int stack_size) { - int i; + int i, j, arg_size, nr_regs; /* Store function arguments to stack. * For a function that accepts two pointers the sequence will be: * mov QWORD PTR [rbp-0x10],rdi * mov QWORD PTR [rbp-0x8],rsi */ - for (i = 0; i < min(nr_args, 6); i++) - emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]), - BPF_REG_FP, - i == 5 ? X86_REG_R9 : BPF_REG_1 + i, - -(stack_size - i * 8)); + for (i = 0, j = 0; i < min(nr_args, 6); i++) { + if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) { + nr_regs = (m->arg_size[i] + 7) / 8; + arg_size = 8; + } else { + nr_regs = 1; + arg_size = m->arg_size[i]; + } + + while (nr_regs) { + emit_stx(prog, bytes_to_bpf_size(arg_size), + BPF_REG_FP, + j == 5 ? X86_REG_R9 : BPF_REG_1 + j, + -(stack_size - j * 8)); + nr_regs--; + j++; + } + } } static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, int stack_size) { - int i; + int i, j, arg_size, nr_regs; /* Restore function arguments from stack. * For a function that accepts two pointers the sequence will be: * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] */ - for (i = 0; i < min(nr_args, 6); i++) - emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]), - i == 5 ? X86_REG_R9 : BPF_REG_1 + i, - BPF_REG_FP, - -(stack_size - i * 8)); + for (i = 0, j = 0; i < min(nr_args, 6); i++) { + if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) { + nr_regs = (m->arg_size[i] + 7) / 8; + arg_size = 8; + } else { + nr_regs = 1; + arg_size = m->arg_size[i]; + } + + while (nr_regs) { + emit_ldx(prog, bytes_to_bpf_size(arg_size), + j == 5 ? X86_REG_R9 : BPF_REG_1 + j, + BPF_REG_FP, + -(stack_size - j * 8)); + nr_regs--; + j++; + } + } } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, @@ -2015,7 +2041,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_links *tlinks, void *orig_call) { - int ret, i, nr_args = m->nr_args; + int ret, i, nr_args = m->nr_args, extra_nregs = 0; int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; @@ -2028,6 +2054,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (nr_args > 6) return -ENOTSUPP; + for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) { + if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) + extra_nregs += (m->arg_size[i] + 7) / 8 - 1; + } + if (nr_args + extra_nregs > 6) + return -ENOTSUPP; + stack_size += extra_nregs * 8; + /* Generated trampoline stack layout: * * RBP + 8 [ return address ] @@ -2040,7 +2074,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * [ ... 
]
 * RBP - regs_off    [ reg_arg1        ]  program's ctx pointer
 *
- * RBP - args_off   [ args count      ]  always
+ * RBP - args_off   [ arg regs count  ]  always
 *
 * RBP - ip_off     [ traced function ]  BPF_TRAMP_F_IP_ARG flag
 *
@@ -2083,11 +2117,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
	EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
	EMIT1(0x53);		 /* push rbx */

-	/* Store number of arguments of the traced function:
-	 *   mov rax, nr_args
+	/* Store number of argument registers of the traced function:
+	 *   mov rax, nr_args + extra_nregs
	 *   mov QWORD PTR [rbp - args_off], rax
	 */
-	emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args);
+	emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args + extra_nregs);
	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);

	if (flags & BPF_TRAMP_F_IP_ARG) {

From 27ed9353aec9de4277b3389c9f2b04beb6ab7622 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Wed, 31 Aug 2022 08:26:57 -0700
Subject: [PATCH 004/143] bpf: Update descriptions for helpers
 bpf_get_func_arg[_cnt]()

Now, instead of the number of arguments, the number of registers holding the argument values is stored in the trampoline. Update the descriptions of the bpf_get_func_arg[_cnt]() helpers accordingly. Previous programs without struct arguments should continue to work as usual.

Signed-off-by: Yonghong Song
Link: https://lore.kernel.org/r/20220831152657.2078805-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/uapi/linux/bpf.h       | 9 +++++----
 tools/include/uapi/linux/bpf.h | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 793103b10eab..3df78c56c1bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5079,12 +5079,12 @@ union bpf_attr {
 *
 * long bpf_get_func_arg(void *ctx, u32 n, u64 *value)
 *	Description
- *		Get **n**-th argument (zero based) of the traced function (for tracing programs)
+ *		Get **n**-th argument register (zero based) of the traced function (for tracing programs)
 *		returned in **value**.
 *
 *	Return
 *		0 on success.
- *		**-EINVAL** if n >= arguments count of traced function.
+ *		**-EINVAL** if n >= argument register count of traced function.
 *
 * long bpf_get_func_ret(void *ctx, u64 *value)
 *	Description
@@ -5097,10 +5097,11 @@ union bpf_attr {
 *
 * long bpf_get_func_arg_cnt(void *ctx)
 *	Description
- *		Get number of arguments of the traced function (for tracing programs).
+ *		Get number of registers of the traced function (for tracing
+ *		programs) in which the function arguments are stored.
 *
 *	Return
- *		The number of arguments of the traced function.
+ *		The number of argument registers of the traced function.
 *
 * int bpf_get_retval(void)
 *	Description
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 793103b10eab..3df78c56c1bf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5079,12 +5079,12 @@ union bpf_attr {
 *
 * long bpf_get_func_arg(void *ctx, u32 n, u64 *value)
 *	Description
- *		Get **n**-th argument (zero based) of the traced function (for tracing programs)
+ *		Get **n**-th argument register (zero based) of the traced function (for tracing programs)
 *		returned in **value**.
 *
 *	Return
 *		0 on success.
- *		**-EINVAL** if n >= arguments count of traced function.
+ *		**-EINVAL** if n >= argument register count of traced function.
 *
 * long bpf_get_func_ret(void *ctx, u64 *value)
 *	Description
@@ -5097,10 +5097,11 @@ union bpf_attr {
 *
 * long bpf_get_func_arg_cnt(void *ctx)
 *	Description
- *		Get number of arguments of the traced function (for tracing programs).
+ *		Get number of registers of the traced function (for tracing
+ *		programs) in which the function arguments are stored.
 *
 *	Return
- *		The number of arguments of the traced function.
+ *		The number of argument registers of the traced function.
 *
 * int bpf_get_retval(void)
 *	Description

From eb707dde264af5eb0271156d7fbd59133fa02cac Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Wed, 31 Aug 2022 08:27:02 -0700
Subject: [PATCH 005/143] bpf: arm64: No support of struct argument in
 trampoline programs

ARM64 does not support struct arguments for trampoline based bpf programs yet.

Signed-off-by: Yonghong Song
Link: https://lore.kernel.org/r/20220831152702.2079066-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov
---
 arch/arm64/net/bpf_jit_comp.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 389623ae5a91..30f76178608b 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1970,7 +1970,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
				u32 flags, struct bpf_tramp_links *tlinks,
				void *orig_call)
 {
-	int ret;
+	int i, ret;
	int nargs = m->nr_args;
	int max_insns = ((long)image_end - (long)image) / AARCH64_INSN_SIZE;
	struct jit_ctx ctx = {
@@ -1982,6 +1982,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
	if (nargs > 8)
		return -ENOTSUPP;

+	/* don't support struct argument */
+	for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
+		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
+			return -ENOTSUPP;
+	}
+
	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nargs, flags);
	if (ret < 0)
		return ret;

From 34586d29f8dfc4ae30642c5b9a4db8a4a7af6869 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Wed, 31 Aug 2022 08:27:07 -0700
Subject: [PATCH 006/143] libbpf: Add new BPF_PROG2 macro

To support struct arguments in trampoline based programs, the existing BPF_PROG macro doesn't work any more, since the type size is needed to find out whether a parameter takes one or two registers. So this patch adds a new BPF_PROG2 macro to support such trampoline programs.

The idea was suggested by Andrii. For example, if the to-be-traced function has a signature like

typedef struct {
	void *x;
	int t;
} sockptr;
int blah(sockptr x, char y);

then in the new BPF_PROG2 macro, the arguments can be represented as

__bpf_prog_call(
	({ union {
		struct { __u64 x, y; } ___z;
		sockptr x;
	} ___tmp = { .___z = { ctx[0], ctx[1] }};
	___tmp.x;
	}),
	({ union {
		struct { __u8 x; } ___z;
		char y;
	} ___tmp = { .___z = { ctx[2] }};
	___tmp.y;
	}));

In the above, the values stored on the stack are properly assigned to the actual argument types by using 'union' magic. Note that the macro also works even if none of the arguments is a struct.

Note that the new BPF_PROG2 macro works with both llvm16 and pre-llvm16 compilers: llvm16 supports passing structs of up to 16 bytes by value on the bpf target, while pre-llvm16 compilers pass them by reference, storing the values on the stack. Since the static function taking the struct arguments is always inlined, the compiler is able to optimize away the additional stack copies of the struct values.
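A usage sketch, mirroring the selftests added later in this series (each parameter is spelled as a separate type and name pair so the macro can compute sizeof(type) and hence the number of registers it occupies):

```
SEC("fentry/bpf_testmod_test_struct_arg_1")
int BPF_PROG2(test_struct_arg_1, struct bpf_testmod_struct_arg_2, a, int, b, int, c)
{
	/* a (a 16-byte struct) is rebuilt from ctx[0] and ctx[1],
	 * b comes from ctx[2] and c from ctx[3].
	 */
	return 0;
}
```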
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220831152707.2079473-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf_tracing.h | 79 +++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 5fdb93da423b..8d4bdd18cb3d 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -438,6 +438,85 @@ typeof(name(0)) name(unsigned long long *ctx) \ static __always_inline typeof(name(0)) \ ____##name(unsigned long long *ctx, ##args) +#ifndef ____bpf_nth +#define ____bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N +#endif +#ifndef ____bpf_narg +#define ____bpf_narg(...) ____bpf_nth(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0) +#endif + +#define BPF_REG_CNT(t) \ + (__builtin_choose_expr(sizeof(t) == 1 || sizeof(t) == 2 || sizeof(t) == 4 || sizeof(t) == 8, 1, \ + __builtin_choose_expr(sizeof(t) == 16, 2, \ + (void)0))) + +#define ____bpf_reg_cnt0() (0) +#define ____bpf_reg_cnt1(t, x) (____bpf_reg_cnt0() + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt2(t, x, args...) (____bpf_reg_cnt1(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt3(t, x, args...) (____bpf_reg_cnt2(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt4(t, x, args...) (____bpf_reg_cnt3(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt5(t, x, args...) (____bpf_reg_cnt4(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt6(t, x, args...) (____bpf_reg_cnt5(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt7(t, x, args...) (____bpf_reg_cnt6(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt8(t, x, args...) (____bpf_reg_cnt7(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt9(t, x, args...) (____bpf_reg_cnt8(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt10(t, x, args...) (____bpf_reg_cnt9(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt11(t, x, args...) (____bpf_reg_cnt10(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt12(t, x, args...) (____bpf_reg_cnt11(args) + BPF_REG_CNT(t)) +#define ____bpf_reg_cnt(args...) ___bpf_apply(____bpf_reg_cnt, ____bpf_narg(args))(args) + +#define ____bpf_union_arg(t, x, n) \ + __builtin_choose_expr(sizeof(t) == 1, ({ union { struct { __u8 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]}}; ___tmp.x; }), \ + __builtin_choose_expr(sizeof(t) == 2, ({ union { struct { __u16 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]} }; ___tmp.x; }), \ + __builtin_choose_expr(sizeof(t) == 4, ({ union { struct { __u32 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]} }; ___tmp.x; }), \ + __builtin_choose_expr(sizeof(t) == 8, ({ union { struct { __u64 x; } ___z; t x; } ___tmp = {.___z = {ctx[n]} }; ___tmp.x; }), \ + __builtin_choose_expr(sizeof(t) == 16, ({ union { struct { __u64 x, y; } ___z; t x; } ___tmp = {.___z = {ctx[n], ctx[n + 1]} }; ___tmp.x; }), \ + (void)0))))) + +#define ____bpf_ctx_arg0(n, args...) +#define ____bpf_ctx_arg1(n, t, x) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt1(t, x)) +#define ____bpf_ctx_arg2(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt2(t, x, args)) ____bpf_ctx_arg1(n, args) +#define ____bpf_ctx_arg3(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt3(t, x, args)) ____bpf_ctx_arg2(n, args) +#define ____bpf_ctx_arg4(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt4(t, x, args)) ____bpf_ctx_arg3(n, args) +#define ____bpf_ctx_arg5(n, t, x, args...) 
, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt5(t, x, args)) ____bpf_ctx_arg4(n, args) +#define ____bpf_ctx_arg6(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt6(t, x, args)) ____bpf_ctx_arg5(n, args) +#define ____bpf_ctx_arg7(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt7(t, x, args)) ____bpf_ctx_arg6(n, args) +#define ____bpf_ctx_arg8(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt8(t, x, args)) ____bpf_ctx_arg7(n, args) +#define ____bpf_ctx_arg9(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt9(t, x, args)) ____bpf_ctx_arg8(n, args) +#define ____bpf_ctx_arg10(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt10(t, x, args)) ____bpf_ctx_arg9(n, args) +#define ____bpf_ctx_arg11(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt11(t, x, args)) ____bpf_ctx_arg10(n, args) +#define ____bpf_ctx_arg12(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt12(t, x, args)) ____bpf_ctx_arg11(n, args) +#define ____bpf_ctx_arg(n, args...) ___bpf_apply(____bpf_ctx_arg, ____bpf_narg(args))(n, args) + +#define ____bpf_ctx_decl0() +#define ____bpf_ctx_decl1(t, x) , t x +#define ____bpf_ctx_decl2(t, x, args...) , t x ____bpf_ctx_decl1(args) +#define ____bpf_ctx_decl3(t, x, args...) , t x ____bpf_ctx_decl2(args) +#define ____bpf_ctx_decl4(t, x, args...) , t x ____bpf_ctx_decl3(args) +#define ____bpf_ctx_decl5(t, x, args...) , t x ____bpf_ctx_decl4(args) +#define ____bpf_ctx_decl6(t, x, args...) , t x ____bpf_ctx_decl5(args) +#define ____bpf_ctx_decl7(t, x, args...) , t x ____bpf_ctx_decl6(args) +#define ____bpf_ctx_decl8(t, x, args...) , t x ____bpf_ctx_decl7(args) +#define ____bpf_ctx_decl9(t, x, args...) , t x ____bpf_ctx_decl8(args) +#define ____bpf_ctx_decl10(t, x, args...) , t x ____bpf_ctx_decl9(args) +#define ____bpf_ctx_decl11(t, x, args...) , t x ____bpf_ctx_decl10(args) +#define ____bpf_ctx_decl12(t, x, args...) , t x ____bpf_ctx_decl11(args) +#define ____bpf_ctx_decl(args...) ___bpf_apply(____bpf_ctx_decl, ____bpf_narg(args))(args) + +/* + * BPF_PROG2 can handle struct arguments. + */ +#define BPF_PROG2(name, args...) \ +name(unsigned long long *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ____bpf_ctx_decl(args)); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + return ____##name(ctx ____bpf_ctx_arg(____bpf_reg_cnt(args), args)); \ +} \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ____bpf_ctx_decl(args)) + struct pt_regs; #define ___bpf_kprobe_args0() ctx From 1642a3945e223a922312fab2401ecdf58b3825b9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 31 Aug 2022 08:27:13 -0700 Subject: [PATCH 007/143] selftests/bpf: Add struct argument tests with fentry/fexit programs. Add various struct argument tests with fentry/fexit programs. Also add one test with a kernel func which does not have any argument to test BPF_PROG2 macro in such situation. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220831152713.2080039-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 48 +++++++ .../selftests/bpf/prog_tests/tracing_struct.c | 63 +++++++++ .../selftests/bpf/progs/tracing_struct.c | 120 ++++++++++++++++++ 3 files changed, 231 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tracing_struct.c create mode 100644 tools/testing/selftests/bpf/progs/tracing_struct.c diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 792cb15bac40..a6021d6117b5 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -18,6 +18,46 @@ typedef int (*func_proto_typedef_nested1)(func_proto_typedef); typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1); DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123; +long bpf_testmod_test_struct_arg_result; + +struct bpf_testmod_struct_arg_1 { + int a; +}; +struct bpf_testmod_struct_arg_2 { + long a; + long b; +}; + +noinline int +bpf_testmod_test_struct_arg_1(struct bpf_testmod_struct_arg_2 a, int b, int c) { + bpf_testmod_test_struct_arg_result = a.a + a.b + b + c; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_2(int a, struct bpf_testmod_struct_arg_2 b, int c) { + bpf_testmod_test_struct_arg_result = a + b.a + b.b + c; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_3(int a, int b, struct bpf_testmod_struct_arg_2 c) { + bpf_testmod_test_struct_arg_result = a + b + c.a + c.b; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_4(struct bpf_testmod_struct_arg_1 a, int b, + int c, int d, struct bpf_testmod_struct_arg_2 e) { + bpf_testmod_test_struct_arg_result = a.a + b + c + d + e.a + e.b; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_5(void) { + bpf_testmod_test_struct_arg_result = 1; + return bpf_testmod_test_struct_arg_result; +} noinline void bpf_testmod_test_mod_kfunc(int i) @@ -98,11 +138,19 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, .off = off, .len = len, }; + struct bpf_testmod_struct_arg_1 struct_arg1 = {10}; + struct bpf_testmod_struct_arg_2 struct_arg2 = {2, 3}; int i = 1; while (bpf_testmod_return_ptr(i)) i++; + (void)bpf_testmod_test_struct_arg_1(struct_arg2, 1, 4); + (void)bpf_testmod_test_struct_arg_2(1, struct_arg2, 4); + (void)bpf_testmod_test_struct_arg_3(1, 4, struct_arg2); + (void)bpf_testmod_test_struct_arg_4(struct_arg1, 1, 2, 3, struct_arg2); + (void)bpf_testmod_test_struct_arg_5(); + /* This is always true. Use the check to make sure the compiler * doesn't remove bpf_testmod_loop_test. */ diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_struct.c b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c new file mode 100644 index 000000000000..d5022b91d1e4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include "tracing_struct.skel.h" + +static void test_fentry(void) +{ + struct tracing_struct *skel; + int err; + + skel = tracing_struct__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_struct__open_and_load")) + return; + + err = tracing_struct__attach(skel); + if (!ASSERT_OK(err, "tracing_struct__attach")) + return; + + ASSERT_OK(trigger_module_test_read(256), "trigger_read"); + + ASSERT_EQ(skel->bss->t1_a_a, 2, "t1:a.a"); + ASSERT_EQ(skel->bss->t1_a_b, 3, "t1:a.b"); + ASSERT_EQ(skel->bss->t1_b, 1, "t1:b"); + ASSERT_EQ(skel->bss->t1_c, 4, "t1:c"); + + ASSERT_EQ(skel->bss->t1_nregs, 4, "t1 nregs"); + ASSERT_EQ(skel->bss->t1_reg0, 2, "t1 reg0"); + ASSERT_EQ(skel->bss->t1_reg1, 3, "t1 reg1"); + ASSERT_EQ(skel->bss->t1_reg2, 1, "t1 reg2"); + ASSERT_EQ(skel->bss->t1_reg3, 4, "t1 reg3"); + ASSERT_EQ(skel->bss->t1_ret, 10, "t1 ret"); + + ASSERT_EQ(skel->bss->t2_a, 1, "t2:a"); + ASSERT_EQ(skel->bss->t2_b_a, 2, "t2:b.a"); + ASSERT_EQ(skel->bss->t2_b_b, 3, "t2:b.b"); + ASSERT_EQ(skel->bss->t2_c, 4, "t2:c"); + ASSERT_EQ(skel->bss->t2_ret, 10, "t2 ret"); + + ASSERT_EQ(skel->bss->t3_a, 1, "t3:a"); + ASSERT_EQ(skel->bss->t3_b, 4, "t3:b"); + ASSERT_EQ(skel->bss->t3_c_a, 2, "t3:c.a"); + ASSERT_EQ(skel->bss->t3_c_b, 3, "t3:c.b"); + ASSERT_EQ(skel->bss->t3_ret, 10, "t3 ret"); + + ASSERT_EQ(skel->bss->t4_a_a, 10, "t4:a.a"); + ASSERT_EQ(skel->bss->t4_b, 1, "t4:b"); + ASSERT_EQ(skel->bss->t4_c, 2, "t4:c"); + ASSERT_EQ(skel->bss->t4_d, 3, "t4:d"); + ASSERT_EQ(skel->bss->t4_e_a, 2, "t4:e.a"); + ASSERT_EQ(skel->bss->t4_e_b, 3, "t4:e.b"); + ASSERT_EQ(skel->bss->t4_ret, 21, "t4 ret"); + + ASSERT_EQ(skel->bss->t5_ret, 1, "t5 ret"); + + tracing_struct__detach(skel); + tracing_struct__destroy(skel); +} + +void test_tracing_struct(void) +{ + test_fentry(); +} diff --git a/tools/testing/selftests/bpf/progs/tracing_struct.c b/tools/testing/selftests/bpf/progs/tracing_struct.c new file mode 100644 index 000000000000..e718f0ebee7d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_struct.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include + +struct bpf_testmod_struct_arg_1 { + int a; +}; +struct bpf_testmod_struct_arg_2 { + long a; + long b; +}; + +long t1_a_a, t1_a_b, t1_b, t1_c, t1_ret, t1_nregs; +__u64 t1_reg0, t1_reg1, t1_reg2, t1_reg3; +long t2_a, t2_b_a, t2_b_b, t2_c, t2_ret; +long t3_a, t3_b, t3_c_a, t3_c_b, t3_ret; +long t4_a_a, t4_b, t4_c, t4_d, t4_e_a, t4_e_b, t4_ret; +long t5_ret; + +SEC("fentry/bpf_testmod_test_struct_arg_1") +int BPF_PROG2(test_struct_arg_1, struct bpf_testmod_struct_arg_2, a, int, b, int, c) +{ + t1_a_a = a.a; + t1_a_b = a.b; + t1_b = b; + t1_c = c; + return 0; +} + +SEC("fexit/bpf_testmod_test_struct_arg_1") +int BPF_PROG2(test_struct_arg_2, struct bpf_testmod_struct_arg_2, a, int, b, int, c, int, ret) +{ + t1_nregs = bpf_get_func_arg_cnt(ctx); + /* a.a */ + bpf_get_func_arg(ctx, 0, &t1_reg0); + /* a.b */ + bpf_get_func_arg(ctx, 1, &t1_reg1); + /* b */ + bpf_get_func_arg(ctx, 2, &t1_reg2); + t1_reg2 = (int)t1_reg2; + /* c */ + bpf_get_func_arg(ctx, 3, &t1_reg3); + t1_reg3 = (int)t1_reg3; + + t1_ret = ret; + return 0; +} + +SEC("fentry/bpf_testmod_test_struct_arg_2") +int BPF_PROG2(test_struct_arg_3, int, a, struct bpf_testmod_struct_arg_2, b, int, c) +{ + t2_a = a; + t2_b_a = b.a; + t2_b_b = b.b; + t2_c = c; + return 0; +} + +SEC("fexit/bpf_testmod_test_struct_arg_2") +int BPF_PROG2(test_struct_arg_4, int, a, struct bpf_testmod_struct_arg_2, b, int, c, int, ret) +{ + t2_ret = ret; + return 0; +} + +SEC("fentry/bpf_testmod_test_struct_arg_3") +int BPF_PROG2(test_struct_arg_5, int, a, int, b, struct bpf_testmod_struct_arg_2, c) +{ + t3_a = a; + t3_b = b; + t3_c_a = c.a; + t3_c_b = c.b; + return 0; +} + +SEC("fexit/bpf_testmod_test_struct_arg_3") +int BPF_PROG2(test_struct_arg_6, int, a, int, b, struct bpf_testmod_struct_arg_2, c, int, ret) +{ + t3_ret = ret; + return 0; +} + +SEC("fentry/bpf_testmod_test_struct_arg_4") +int BPF_PROG2(test_struct_arg_7, struct bpf_testmod_struct_arg_1, a, int, b, + int, c, int, d, struct bpf_testmod_struct_arg_2, e) +{ + t4_a_a = a.a; + t4_b = b; + t4_c = c; + t4_d = d; + t4_e_a = e.a; + t4_e_b = e.b; + return 0; +} + +SEC("fexit/bpf_testmod_test_struct_arg_4") +int BPF_PROG2(test_struct_arg_8, struct bpf_testmod_struct_arg_1, a, int, b, + int, c, int, d, struct bpf_testmod_struct_arg_2, e, int, ret) +{ + t4_ret = ret; + return 0; +} + +SEC("fentry/bpf_testmod_test_struct_arg_5") +int BPF_PROG2(test_struct_arg_9) +{ + return 0; +} + +SEC("fexit/bpf_testmod_test_struct_arg_5") +int BPF_PROG2(test_struct_arg_10, int, ret) +{ + t5_ret = ret; + return 0; +} + +char _license[] SEC("license") = "GPL"; From a7c2ca3a2f697044094475055b3fba3929b234e4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 31 Aug 2022 08:27:18 -0700 Subject: [PATCH 008/143] selftests/bpf: Use BPF_PROG2 for some fentry programs without struct arguments Use BPF_PROG2 instead of BPF_PROG for programs in progs/timer.c to test BPF_PROG2 for cases without struct arguments. 
Signed-off-by: Yonghong Song
Link: https://lore.kernel.org/r/20220831152718.2081091-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov
---
 tools/testing/selftests/bpf/progs/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c
index 0053c5402173..acda5c9cea93 100644
--- a/tools/testing/selftests/bpf/progs/timer.c
+++ b/tools/testing/selftests/bpf/progs/timer.c
@@ -120,7 +120,7 @@ static int timer_cb1(void *map, int *key, struct bpf_timer *timer)
 }
 
 SEC("fentry/bpf_fentry_test1")
-int BPF_PROG(test1, int a)
+int BPF_PROG2(test1, int, a)
 {
	struct bpf_timer *arr_timer, *lru_timer;
	struct elem init = {};
@@ -236,7 +236,7 @@ int bpf_timer_test(void)
 }
 
 SEC("fentry/bpf_fentry_test2")
-int BPF_PROG(test2, int a, int b)
+int BPF_PROG2(test2, int, a, int, b)
 {
	struct hmap_elem init = {}, *val;
	int key = HTAB, key_malloc = HTAB_MALLOC;

From ae63c10fc241a94bb916da96d40c8810f9ad7f18 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Wed, 31 Aug 2022 08:27:23 -0700
Subject: [PATCH 009/143] selftests/bpf: Add tracing_struct test in
 DENYLIST.s390x

Add the tracing_struct test to DENYLIST.s390x, since s390x does not support trampolines yet.

Signed-off-by: Yonghong Song
Link: https://lore.kernel.org/r/20220831152723.2081551-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov
---
 tools/testing/selftests/bpf/DENYLIST.s390x | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index 18fbb6eab1e2..168c5b287b5c 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -70,3 +70,4 @@ setget_sockopt          # attach unexpected error: -524
 cb_refs                 # expected error message unexpected error: -524 (trampoline)
 cgroup_hierarchical_stats # JIT does not support calling kernel function (kfunc)
 htab_update             # failed to attach: ERROR: strerror_r(-524)=22 (trampoline)
+tracing_struct          # failed to auto-attach: -524 (trampoline)

From 012ba1156e4a7b38062d109b818cb479a68c87ba Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires
Date: Tue, 6 Sep 2022 17:12:57 +0200
Subject: [PATCH 010/143] selftests/bpf: regroup and declare similar kfuncs
 selftests in an array

Similar to tools/testing/selftests/bpf/prog_tests/dynptr.c: we declare an array of tests that we run one by one in a for loop.

Followup patches will add more similar-ish tests, so we avoid a lot of copy-paste by grouping the declarations in an array.

For light skeletons, we have to rely on the offsetof() macro so we can statically declare which program we are using. In the libbpf case, we can rely on bpf_object__find_program_by_name(). So also change the Makefile to generate both light skeletons and normal ones.
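The resulting pattern is roughly the following (a sketch condensed from the diff below): each table entry records offsetof(struct kfunc_call_test_lskel, progs.<name>), and the runner recovers the light skeleton's program descriptor with plain pointer arithmetic:

```
struct bpf_prog_desc *lskel_prog;

lskel_prog = (struct bpf_prog_desc *)((char *)lskel +
				      param->lskel_prog_desc_offset);
prog_fd = lskel_prog->prog_fd;
```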
Signed-off-by: Benjamin Tissoires Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220906151303.2780789-2-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 5 +- .../selftests/bpf/prog_tests/kfunc_call.c | 81 +++++++++++++++---- 2 files changed, 68 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c10adecb5a73..6cd327f1f216 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -351,11 +351,12 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ test_subskeleton.skel.h test_subskeleton_lib.skel.h \ test_usdt.skel.h -LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \ +LSKELS := fentry_test.c fexit_test.c fexit_sleep.c \ test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c \ map_ptr_kern.c core_kern.c core_kern_overflow.c # Generate both light skeleton and libbpf skeleton for these -LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c +LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ + kfunc_call_test_subprog.c SKEL_BLACKLIST += $$(LSKELS) test_static_linked.skel.h-deps := test_static_linked1.bpf.o test_static_linked2.bpf.o diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index eede7c304f86..9dfbe5355a2d 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -2,6 +2,7 @@ /* Copyright (c) 2021 Facebook */ #include #include +#include "kfunc_call_test.skel.h" #include "kfunc_call_test.lskel.h" #include "kfunc_call_test_subprog.skel.h" #include "kfunc_call_test_subprog.lskel.h" @@ -9,9 +10,31 @@ #include "cap_helpers.h" -static void test_main(void) +struct kfunc_test_params { + const char *prog_name; + unsigned long lskel_prog_desc_offset; + int retval; +}; + +#define TC_TEST(name, __retval) \ + { \ + .prog_name = #name, \ + .lskel_prog_desc_offset = offsetof(struct kfunc_call_test_lskel, progs.name), \ + .retval = __retval, \ + } + +static struct kfunc_test_params kfunc_tests[] = { + TC_TEST(kfunc_call_test1, 12), + TC_TEST(kfunc_call_test2, 3), + TC_TEST(kfunc_call_test_ref_btf_id, 0), +}; + +static void verify_success(struct kfunc_test_params *param) { - struct kfunc_call_test_lskel *skel; + struct kfunc_call_test_lskel *lskel = NULL; + struct bpf_prog_desc *lskel_prog; + struct kfunc_call_test *skel; + struct bpf_program *prog; int prog_fd, err; LIBBPF_OPTS(bpf_test_run_opts, topts, .data_in = &pkt_v4, @@ -19,26 +42,53 @@ static void test_main(void) .repeat = 1, ); - skel = kfunc_call_test_lskel__open_and_load(); + /* first test with normal libbpf */ + skel = kfunc_call_test__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel")) return; - prog_fd = skel->progs.kfunc_call_test1.prog_fd; - err = bpf_prog_test_run_opts(prog_fd, &topts); - ASSERT_OK(err, "bpf_prog_test_run(test1)"); - ASSERT_EQ(topts.retval, 12, "test1-retval"); + prog = bpf_object__find_program_by_name(skel->obj, param->prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto cleanup; - prog_fd = skel->progs.kfunc_call_test2.prog_fd; + prog_fd = bpf_program__fd(prog); err = bpf_prog_test_run_opts(prog_fd, &topts); - ASSERT_OK(err, "bpf_prog_test_run(test2)"); - ASSERT_EQ(topts.retval, 3, "test2-retval"); + if (!ASSERT_OK(err, param->prog_name)) + goto cleanup; - prog_fd = 
skel->progs.kfunc_call_test_ref_btf_id.prog_fd;
+	if (!ASSERT_EQ(topts.retval, param->retval, "retval"))
+		goto cleanup;
+
+	/* second test with light skeletons */
+	lskel = kfunc_call_test_lskel__open_and_load();
+	if (!ASSERT_OK_PTR(lskel, "lskel"))
+		goto cleanup;
+
+	lskel_prog = (struct bpf_prog_desc *)((char *)lskel + param->lskel_prog_desc_offset);
+
+	prog_fd = lskel_prog->prog_fd;
	err = bpf_prog_test_run_opts(prog_fd, &topts);
-	ASSERT_OK(err, "bpf_prog_test_run(test_ref_btf_id)");
-	ASSERT_EQ(topts.retval, 0, "test_ref_btf_id-retval");
+	if (!ASSERT_OK(err, param->prog_name))
+		goto cleanup;

-	kfunc_call_test_lskel__destroy(skel);
+	ASSERT_EQ(topts.retval, param->retval, "retval");
+
+cleanup:
+	kfunc_call_test__destroy(skel);
+	if (lskel)
+		kfunc_call_test_lskel__destroy(lskel);
+}
+
+static void test_main(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(kfunc_tests); i++) {
+		if (!test__start_subtest(kfunc_tests[i].prog_name))
+			continue;
+
+		verify_success(&kfunc_tests[i]);
+	}
 }
 
 static void test_subprog(void)
@@ -121,8 +171,7 @@ static void test_destructive(void)
 
 void test_kfunc_call(void)
 {
-	if (test__start_subtest("main"))
-		test_main();
+	test_main();
 
	if (test__start_subtest("subprog"))
		test_subprog();

From 95f2f26f3cac06cfc046d2b29e60719d7848ea54 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires
Date: Tue, 6 Sep 2022 17:12:58 +0200
Subject: [PATCH 011/143] bpf: split btf_check_subprog_arg_match in two

btf_check_subprog_arg_match() was used twice in verifier.c:
- when checking for type mismatches between a (sub)prog declaration and BTF
- when checking the call of a subprog to see if the provided arguments are correct and valid

This is problematic when we check if the first argument of a program (pointer to ctx) is correctly accessed: to be able to ensure we access valid memory in the ctx, the verifier assumes the pointer to the context is not null. This has the side effect of marking the program as accessing the entire context, even if the context is never dereferenced.

For example, by checking the context access with the current code, the following eBPF program would fail with -EINVAL if the ctx is set to null from userspace:

```
SEC("syscall")
int prog(struct my_ctx *args)
{
	return 0;
}
```

In that particular case, we do not want to actually check that the memory is correct while checking for BTF validity; we just want to ensure that the (sub)prog definition matches the BTF we have.

So split btf_check_subprog_arg_match() in two, so that we can actually check the memory accesses when in a call, and ignore that part when not.

Note that a further patch is in preparation to disentangle btf_check_func_arg_match() from these two purposes, so right now we just add a new hack around that by adding a boolean to this function.
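For reference, the case this change enables corresponds to running such a program with no context at all from userspace, roughly (a libbpf sketch; error handling omitted):

```
LIBBPF_OPTS(bpf_test_run_opts, topts);	/* ctx_in left NULL, ctx_size_in 0 */

err = bpf_prog_test_run_opts(bpf_program__fd(prog), &topts);
```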
Signed-off-by: Benjamin Tissoires Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220906151303.2780789-3-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/btf.c | 54 +++++++++++++++++++++++++++++++++++++++---- kernel/bpf/verifier.c | 2 +- 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4d32f125f4af..3cf161cfd396 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1947,6 +1947,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); +int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs); int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ea94527e5d70..9291e2b2c950 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6203,7 +6203,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok, - u32 kfunc_flags) + u32 kfunc_flags, + bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); bool rel = false, kptr_get = false, trusted_arg = false; @@ -6389,7 +6390,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname); return -EINVAL; } - } else if (ptr_to_mem_ok) { + } else if (ptr_to_mem_ok && processing_call) { const struct btf_type *resolve_ret; u32 type_size; @@ -6464,7 +6465,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return rel ? ref_regno : 0; } -/* Compare BTF of a function with given bpf_reg_state. +/* Compare BTF of a function declaration with given bpf_reg_state. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - there is a type mismatch or BTF is not available. @@ -6491,7 +6492,50 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, false); + + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ + if (err) + prog->aux->func_info_aux[subprog].unreliable = true; + return err; +} + +/* Compare BTF of a function call with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + * + * NOTE: the code is duplicated from btf_check_subprog_arg_match() + * because btf_check_func_arg_match() is still doing both. Once that + * function is split in 2, we can call from here btf_check_subprog_arg_match() + * first, and then treat the calling part in a new code path. 
+ */
+int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+			   struct bpf_reg_state *regs)
+{
+	struct bpf_prog *prog = env->prog;
+	struct btf *btf = prog->aux->btf;
+	bool is_global;
+	u32 btf_id;
+	int err;
+
+	if (!prog->aux->func_info)
+		return -EINVAL;
+
+	btf_id = prog->aux->func_info[subprog].type_id;
+	if (!btf_id)
+		return -EFAULT;
+
+	if (prog->aux->func_info_aux[subprog].unreliable)
+		return -EINVAL;
+
+	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, true);
 
	/* Compiler optimizations can remove arguments from static functions
	 * or mismatched type can be passed into a global function.
@@ -6507,7 +6551,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
			      struct bpf_reg_state *regs,
			      u32 kfunc_flags)
 {
-	return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags);
+	return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags, true);
 }
 
 /* Convert BTF of a function into bpf_reg_state if possible
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 003f7ba19558..7d9a2e18ca8a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6629,7 +6629,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
	func_info_aux = env->prog->aux->func_info_aux;
	if (func_info_aux)
		is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_subprog_arg_match(env, subprog, caller->regs);
+	err = btf_check_subprog_call(env, subprog, caller->regs);
	if (err == -EFAULT)
		return err;
	if (is_global) {

From 15baa55ff5b00b81bcd9874b89cb8e0b0daaa13d Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires
Date: Tue, 6 Sep 2022 17:12:59 +0200
Subject: [PATCH 012/143] bpf/verifier: allow all functions to read user
 provided context

When a function tried to access data from context in a syscall eBPF program, the verifier rejected the call unless it accessed the first element. This is because the syscall context is not known at compile time, and so we need to check the access when it actually happens.

Check that the memory access is valid when there is no convert_ctx_access callback, and allow such accesses to happen.

Acked-by: Kumar Kartikeya Dwivedi
Signed-off-by: Benjamin Tissoires
Link: https://lore.kernel.org/r/20220906151303.2780789-4-benjamin.tissoires@redhat.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7d9a2e18ca8a..3cfe60206de6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5233,6 +5233,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
					env,
					regno, reg->off, access_size,
					zero_size_allowed, ACCESS_HELPER, meta);
+	case PTR_TO_CTX:
+		/* in case the function doesn't know how to access the context,
+		 * (because we are in a program of type SYSCALL for example), we
+		 * can not statically check its size.
+		 * Dynamically check it now.
+		 */
+		if (!env->ops->convert_ctx_access) {
+			enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
+			int offset = access_size - 1;
+
+			/* Allow zero-byte read from PTR_TO_CTX */
+			if (access_size == 0)
+				return zero_size_allowed ? 
0 : -EACCES; + + return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, + atype, -1, false); + } + + fallthrough; default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && From fb66223a244f252273995557b23e0fa53092e92c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:13:00 +0200 Subject: [PATCH 013/143] selftests/bpf: add test for accessing ctx from syscall program type We need to also export the kfunc set to the syscall program type, and then add a couple of eBPF programs that are testing those calls. The first one checks for valid access, and the second one is OK from a static analysis point of view but fails at run time because we are trying to access outside of the allocated memory. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220906151303.2780789-5-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 1 + .../selftests/bpf/prog_tests/kfunc_call.c | 143 +++++++++++++++++- .../selftests/bpf/progs/kfunc_call_fail.c | 39 +++++ .../selftests/bpf/progs/kfunc_call_test.c | 38 +++++ 4 files changed, 214 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_fail.c diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 25d8ecf105aa..f16baf977a21 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1634,6 +1634,7 @@ static int __init bpf_prog_test_run_init(void) ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, ARRAY_SIZE(bpf_prog_test_dtor_kfunc), THIS_MODULE); diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 9dfbe5355a2d..d5881c3331a8 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -2,6 +2,7 @@ /* Copyright (c) 2021 Facebook */ #include #include +#include "kfunc_call_fail.skel.h" #include "kfunc_call_test.skel.h" #include "kfunc_call_test.lskel.h" #include "kfunc_call_test_subprog.skel.h" @@ -10,37 +11,96 @@ #include "cap_helpers.h" +static size_t log_buf_sz = 1048576; /* 1 MB */ +static char obj_log_buf[1048576]; + +enum kfunc_test_type { + tc_test = 0, + syscall_test, + syscall_null_ctx_test, +}; + struct kfunc_test_params { const char *prog_name; unsigned long lskel_prog_desc_offset; int retval; + enum kfunc_test_type test_type; + const char *expected_err_msg; }; -#define TC_TEST(name, __retval) \ +#define __BPF_TEST_SUCCESS(name, __retval, type) \ { \ .prog_name = #name, \ .lskel_prog_desc_offset = offsetof(struct kfunc_call_test_lskel, progs.name), \ .retval = __retval, \ + .test_type = type, \ + .expected_err_msg = NULL, \ } +#define __BPF_TEST_FAIL(name, __retval, type, error_msg) \ + { \ + .prog_name = #name, \ + .lskel_prog_desc_offset = 0 /* unused when test is failing */, \ + .retval = __retval, \ + .test_type = type, \ + .expected_err_msg = error_msg, \ + } + +#define TC_TEST(name, retval) __BPF_TEST_SUCCESS(name, retval, tc_test) +#define SYSCALL_TEST(name, retval) __BPF_TEST_SUCCESS(name, retval, syscall_test) +#define SYSCALL_NULL_CTX_TEST(name, retval) __BPF_TEST_SUCCESS(name, retval, syscall_null_ctx_test) + 
+#define SYSCALL_NULL_CTX_FAIL(name, retval, error_msg) \
+	__BPF_TEST_FAIL(name, retval, syscall_null_ctx_test, error_msg)
+
 static struct kfunc_test_params kfunc_tests[] = {
+	/* failure cases:
+	 * if retval is 0 -> the program will fail to load and the error message is an error
+	 * if retval is not 0 -> the program can be loaded but running it will give the
+	 *			 provided return value. The error message is thus the one
+	 *			 from a successful load
+	 */
+	SYSCALL_NULL_CTX_FAIL(kfunc_syscall_test_fail, -EINVAL, "processed 4 insns"),
+	SYSCALL_NULL_CTX_FAIL(kfunc_syscall_test_null_fail, -EINVAL, "processed 4 insns"),
+
+	/* success cases */
 	TC_TEST(kfunc_call_test1, 12),
 	TC_TEST(kfunc_call_test2, 3),
 	TC_TEST(kfunc_call_test_ref_btf_id, 0),
+	SYSCALL_TEST(kfunc_syscall_test, 0),
+	SYSCALL_NULL_CTX_TEST(kfunc_syscall_test_null, 0),
+};
+
+struct syscall_test_args {
+	__u8 data[16];
+	size_t size;
 };
 
 static void verify_success(struct kfunc_test_params *param)
 {
 	struct kfunc_call_test_lskel *lskel = NULL;
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
 	struct bpf_prog_desc *lskel_prog;
 	struct kfunc_call_test *skel;
 	struct bpf_program *prog;
 	int prog_fd, err;
-	LIBBPF_OPTS(bpf_test_run_opts, topts,
-		.data_in = &pkt_v4,
-		.data_size_in = sizeof(pkt_v4),
-		.repeat = 1,
-	);
+	struct syscall_test_args args = {
+		.size = 10,
+	};
+
+	switch (param->test_type) {
+	case syscall_test:
+		topts.ctx_in = &args;
+		topts.ctx_size_in = sizeof(args);
+		/* fallthrough */
+	case syscall_null_ctx_test:
+		break;
+	case tc_test:
+		topts.data_in = &pkt_v4;
+		topts.data_size_in = sizeof(pkt_v4);
+		topts.repeat = 1;
+		break;
+	}
 
 	/* first test with normal libbpf */
 	skel = kfunc_call_test__open_and_load();
@@ -79,6 +139,72 @@ cleanup:
 	kfunc_call_test_lskel__destroy(lskel);
 }
 
+static void verify_fail(struct kfunc_test_params *param)
+{
+	LIBBPF_OPTS(bpf_object_open_opts, opts);
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct bpf_program *prog;
+	struct kfunc_call_fail *skel;
+	int prog_fd, err;
+	struct syscall_test_args args = {
+		.size = 10,
+	};
+
+	opts.kernel_log_buf = obj_log_buf;
+	opts.kernel_log_size = log_buf_sz;
+	opts.kernel_log_level = 1;
+
+	switch (param->test_type) {
+	case syscall_test:
+		topts.ctx_in = &args;
+		topts.ctx_size_in = sizeof(args);
+		/* fallthrough */
+	case syscall_null_ctx_test:
+		break;
+	case tc_test:
+		topts.data_in = &pkt_v4;
+		topts.data_size_in = sizeof(pkt_v4);
+		topts.repeat = 1;
+		break;
+	}
+
+	skel = kfunc_call_fail__open_opts(&opts);
+	if (!ASSERT_OK_PTR(skel, "kfunc_call_fail__open_opts"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(skel->obj, param->prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto cleanup;
+
+	bpf_program__set_autoload(prog, true);
+
+	err = kfunc_call_fail__load(skel);
+	if (!param->retval) {
+		/* the verifier is supposed to complain and refuse to load */
+		if (!ASSERT_ERR(err, "unexpected load success"))
+			goto out_err;
+
+	} else {
+		/* the program is loaded but must dynamically fail */
+		if (!ASSERT_OK(err, "unexpected load error"))
+			goto out_err;
+
+		prog_fd = bpf_program__fd(prog);
+		err = bpf_prog_test_run_opts(prog_fd, &topts);
+		if (!ASSERT_EQ(err, param->retval, param->prog_name))
+			goto out_err;
+	}
+
+out_err:
+	if (!ASSERT_OK_PTR(strstr(obj_log_buf, param->expected_err_msg), "expected_err_msg")) {
+		fprintf(stderr, "Expected err_msg: %s\n", param->expected_err_msg);
+		fprintf(stderr, "Verifier output: %s\n", obj_log_buf);
+	}
+
+cleanup:
+	kfunc_call_fail__destroy(skel);
+}
+
 static void 
test_main(void)
 {
	int i;

@@ -87,7 +213,10 @@ static void test_main(void)
		if (!test__start_subtest(kfunc_tests[i].prog_name))
			continue;

-		verify_success(&kfunc_tests[i]);
+		if (!kfunc_tests[i].expected_err_msg)
+			verify_success(&kfunc_tests[i]);
+		else
+			verify_fail(&kfunc_tests[i]);
	}
 }

diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c
new file mode 100644
index 000000000000..4168027f2ab1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include
+#include
+
+extern void bpf_kfunc_call_test_mem_len_pass1(void *mem, int len) __ksym;
+
+struct syscall_test_args {
+	__u8 data[16];
+	size_t size;
+};
+
+SEC("?syscall")
+int kfunc_syscall_test_fail(struct syscall_test_args *args)
+{
+	bpf_kfunc_call_test_mem_len_pass1(&args->data, sizeof(*args) + 1);
+
+	return 0;
+}
+
+SEC("?syscall")
+int kfunc_syscall_test_null_fail(struct syscall_test_args *args)
+{
+	/* Must be called with args as a NULL pointer.
+	 * We do not check for it to have the verifier consider that
+	 * the pointer might not be null, and so we can load it.
+	 *
+	 * So the following can not be added:
+	 *
+	 * if (args)
+	 *	return -22;
+	 */
+
+	bpf_kfunc_call_test_mem_len_pass1(args, sizeof(*args));
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
index 5aecbb9fdc68..9e1914916f1d 100644
--- a/tools/testing/selftests/bpf/progs/kfunc_call_test.c
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
@@ -92,4 +92,42 @@ int kfunc_call_test_pass(struct __sk_buff *skb)
	return 0;
 }

+struct syscall_test_args {
+	__u8 data[16];
+	size_t size;
+};
+
+SEC("syscall")
+int kfunc_syscall_test(struct syscall_test_args *args)
+{
+	const long size = args->size;
+
+	if (size > sizeof(args->data))
+		return -7; /* -E2BIG */
+
+	bpf_kfunc_call_test_mem_len_pass1(&args->data, sizeof(args->data));
+	bpf_kfunc_call_test_mem_len_pass1(&args->data, sizeof(*args));
+	bpf_kfunc_call_test_mem_len_pass1(&args->data, size);
+
+	return 0;
+}
+
+SEC("syscall")
+int kfunc_syscall_test_null(struct syscall_test_args *args)
+{
+	/* Must be called with args as a NULL pointer.
+	 * We do not check for it to have the verifier consider that
+	 * the pointer might not be null, and so we can load it.
+	 *
+	 * So the following can not be added:
+	 *
+	 * if (args)
+	 *	return -22;
+	 */
+
+	bpf_kfunc_call_test_mem_len_pass1(args, 0);
+
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";

From f9b348185f4d684cc19e6bd9b87904823d5aa5ed Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires
Date: Tue, 6 Sep 2022 17:13:01 +0200
Subject: [PATCH 014/143] bpf/btf: bump BTF_KFUNC_SET_MAX_CNT

net/bpf/test_run.c already presents 20 kfuncs, and net/netfilter/nf_conntrack_bpf.c presents an extra 10 kfuncs. Given that all the kfuncs are regrouped into one unique set, having only 2 slots left prevents us from adding more selftests. Bump it to 256.
Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220906151303.2780789-6-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9291e2b2c950..2c2d8190ca4a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -208,7 +208,7 @@ enum btf_kfunc_hook { }; enum { - BTF_KFUNC_SET_MAX_CNT = 32, + BTF_KFUNC_SET_MAX_CNT = 256, BTF_DTOR_KFUNC_MAX_CNT = 256, }; From eb1f7f71c126c8fd50ea81af98f97c4b581ea4ae Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:13:02 +0200 Subject: [PATCH 015/143] bpf/verifier: allow kfunc to return an allocated mem For drivers (outside of network), the incoming data is not statically defined in a struct. Most of the time the data buffer is kzalloc-ed and thus we cannot rely on eBPF and BTF to explore the data. This commit allows a kfunc to return arbitrary memory previously allocated by the driver. An interesting extra point is that the kfunc can mark the exported memory region as read only or read/write. So, when a kfunc is not returning a pointer to a struct but to a plain type, we can consider it a valid allocated memory region, assuming that: - one of the arguments is either called rdonly_buf_size or rdwr_buf_size - and this argument is a const from the caller's point of view We can then use this parameter as the size of the allocated memory. The memory is either read-only or read-write based on the name of the size parameter. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220906151303.2780789-7-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++- include/linux/bpf_verifier.h | 2 + include/linux/btf.h | 10 ++++ kernel/bpf/btf.c | 101 ++++++++++++++++++++++++++++------- kernel/bpf/verifier.c | 45 +++++++++++----- 5 files changed, 133 insertions(+), 34 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3cf161cfd396..79883f883ff3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1944,6 +1944,13 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); +struct bpf_kfunc_arg_meta { + u64 r0_size; + bool r0_rdonly; + int ref_obj_id; + u32 flags; +}; + struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); @@ -1952,7 +1959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - u32 kfunc_flags); + struct bpf_kfunc_arg_meta *meta); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1fdddbf3546b..8fbc1d05281e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -598,6 +598,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); +int mark_chain_precision(struct bpf_verifier_env *env, int regno); + #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type.
*/ diff --git a/include/linux/btf.h b/include/linux/btf.h index ad93c2d9cc1c..1fcc833a8690 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -441,4 +441,14 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt } #endif +static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) +{ + if (!btf_type_is_ptr(t)) + return false; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + return btf_type_is_struct(t); +} + #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2c2d8190ca4a..9d12212fcd61 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6199,11 +6199,36 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf, return true; } +static bool btf_is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg, + const char *name) +{ + int len, target_len = strlen(name); + const struct btf_type *t; + const char *param_name; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len != target_len) + return false; + if (strcmp(param_name, name)) + return false; + + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok, - u32 kfunc_flags, + struct bpf_kfunc_arg_meta *kfunc_meta, bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); @@ -6241,12 +6266,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - if (is_kfunc) { + if (is_kfunc && kfunc_meta) { /* Only kfunc can be release func */ - rel = kfunc_flags & KF_RELEASE; - kptr_get = kfunc_flags & KF_KPTR_GET; - trusted_arg = kfunc_flags & KF_TRUSTED_ARGS; - sleepable = kfunc_flags & KF_SLEEPABLE; + rel = kfunc_meta->flags & KF_RELEASE; + kptr_get = kfunc_meta->flags & KF_KPTR_GET; + trusted_arg = kfunc_meta->flags & KF_TRUSTED_ARGS; + sleepable = kfunc_meta->flags & KF_SLEEPABLE; } /* check that BTF function arguments match actual types that the @@ -6259,6 +6284,38 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, t = btf_type_skip_modifiers(btf, args[i].type, NULL); if (btf_type_is_scalar(t)) { + if (is_kfunc && kfunc_meta) { + bool is_buf_size = false; + + /* check for any const scalar parameter of name "rdonly_buf_size" + * or "rdwr_buf_size" + */ + if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, + "rdonly_buf_size")) { + kfunc_meta->r0_rdonly = true; + is_buf_size = true; + } else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, + "rdwr_buf_size")) + is_buf_size = true; + + if (is_buf_size) { + if (kfunc_meta->r0_size) { + bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); + return -EINVAL; + } + + if (!tnum_is_const(reg->var_off)) { + bpf_log(log, "R%d is not a const\n", regno); + return -EINVAL; + } + + kfunc_meta->r0_size = reg->var_off.value; + ret = mark_chain_precision(env, regno); + if (ret) + return ret; + } + } + if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", regno); @@ -6289,6 +6346,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (ret < 0) return ret; + if (is_kfunc && reg->ref_obj_id) { + /* Ensure only one argument is referenced PTR_TO_BTF_ID */ + if (ref_obj_id) { + bpf_log(log, "verifier internal error: more than 
one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, ref_obj_id); + return -EFAULT; + } + ref_regno = regno; + ref_obj_id = reg->ref_obj_id; + } + /* kptr_get is only true for kfunc */ if (i == 0 && kptr_get) { struct bpf_map_value_off_desc *off_desc; @@ -6361,16 +6429,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - /* Ensure only one argument is referenced PTR_TO_BTF_ID */ - if (reg->ref_obj_id) { - if (ref_obj_id) { - bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, ref_obj_id); - return -EFAULT; - } - ref_regno = regno; - ref_obj_id = reg->ref_obj_id; - } } else { reg_btf = btf_vmlinux; reg_ref_id = *reg2btf_ids[base_type(reg->type)]; @@ -6461,6 +6519,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + if (kfunc_meta && ref_obj_id) + kfunc_meta->ref_obj_id = ref_obj_id; + /* returns argument register number > 0 in case of reference release kfunc */ return rel ? ref_regno : 0; } @@ -6492,7 +6553,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, false); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6535,7 +6596,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, true); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. 
@@ -6549,9 +6610,9 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - u32 kfunc_flags) + struct bpf_kfunc_arg_meta *meta) { - return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags, true); + return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true); } /* Convert BTF of a function into bpf_reg_state if possible diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3cfe60206de6..f3344a86d88d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2908,7 +2908,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, return 0; } -static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +int mark_chain_precision(struct bpf_verifier_env *env, int regno) { return __mark_chain_precision(env, regno, -1); } @@ -7595,6 +7595,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); + struct bpf_kfunc_arg_meta meta = { 0 }; const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; int err, insn_idx = *insn_idx_p; @@ -7629,8 +7630,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, acq = *kfunc_flags & KF_ACQUIRE; + meta.flags = *kfunc_flags; + /* Check the arguments */ - err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags); + err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta); if (err < 0) return err; /* In case of release function, we get register number of refcounted @@ -7651,7 +7654,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); - if (acq && !btf_type_is_ptr(t)) { + if (acq && !btf_type_is_struct_ptr(desc_btf, t)) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; } @@ -7663,17 +7666,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); if (!btf_type_is_struct(ptr_type)) { - ptr_type_name = btf_name_by_offset(desc_btf, - ptr_type->name_off); - verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", - func_name, btf_type_str(ptr_type), - ptr_type_name); - return -EINVAL; + if (!meta.r0_size) { + ptr_type_name = btf_name_by_offset(desc_btf, + ptr_type->name_off); + verbose(env, + "kernel function %s returns pointer type %s %s is not supported\n", + func_name, + btf_type_str(ptr_type), + ptr_type_name); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM; + regs[BPF_REG_0].mem_size = meta.r0_size; + + if (meta.r0_rdonly) + regs[BPF_REG_0].type |= MEM_RDONLY; + + /* Ensures we don't access the memory after a release_reference() */ + if (meta.ref_obj_id) + regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = PTR_TO_BTF_ID; + regs[BPF_REG_0].btf_id = ptr_type_id; } - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; if (*kfunc_flags & KF_RET_NULL) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* 
For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ From 22ed8d5a46520ef0f060e7c0ee91f1cc6f684400 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:13:03 +0200 Subject: [PATCH 016/143] selftests/bpf: Add tests for kfunc returning a memory pointer We add 2 new kfuncs that follow the RET_PTR_TO_MEM capability from the previous commit. Then we test them in selftests: the first tests exercise the valid cases and do not fail, while the later ones are wrong and would actually prevent the program from being loaded. To work around that, we mark the failing ones as not autoloaded (with SEC("?tc")), and we manually enable them one by one, ensuring the verifier rejects them. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220906151303.2780789-8-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 36 ++++++ .../selftests/bpf/prog_tests/kfunc_call.c | 7 + .../selftests/bpf/progs/kfunc_call_fail.c | 121 ++++++++++++++++++ .../selftests/bpf/progs/kfunc_call_test.c | 33 +++++ 4 files changed, 197 insertions(+) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index f16baf977a21..13d578ce2a09 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -606,6 +606,38 @@ noinline void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p) WARN_ON_ONCE(1); } +static int *__bpf_kfunc_call_test_get_mem(struct prog_test_ref_kfunc *p, const int size) +{ + if (size > 2 * sizeof(int)) + return NULL; + + return (int *)p; +} + +noinline int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) +{ + return __bpf_kfunc_call_test_get_mem(p, rdwr_buf_size); +} + +noinline int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) +{ + return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size); +} + +/* the next 2 ones can't really be used for testing, except to ensure + * that the verifier rejects the call. + * Acquire functions must return struct pointers, so these ones are + * failing.
+ */ +noinline int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) +{ + return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size); +} + +noinline void bpf_kfunc_call_int_mem_release(int *p) +{ +} + noinline struct prog_test_ref_kfunc * bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **pp, int a, int b) { @@ -712,6 +744,10 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb1_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdwr_mem, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdonly_mem, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_acq_rdonly_mem, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_int_mem_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_kptr_get, KF_ACQUIRE | KF_RET_NULL | KF_KPTR_GET) BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass_ctx) BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass1) diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index d5881c3331a8..5af1ee8f0e6e 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -50,6 +50,7 @@ struct kfunc_test_params { #define SYSCALL_TEST(name, retval) __BPF_TEST_SUCCESS(name, retval, syscall_test) #define SYSCALL_NULL_CTX_TEST(name, retval) __BPF_TEST_SUCCESS(name, retval, syscall_null_ctx_test) +#define TC_FAIL(name, retval, error_msg) __BPF_TEST_FAIL(name, retval, tc_test, error_msg) #define SYSCALL_NULL_CTX_FAIL(name, retval, error_msg) \ __BPF_TEST_FAIL(name, retval, syscall_null_ctx_test, error_msg) @@ -62,11 +63,17 @@ static struct kfunc_test_params kfunc_tests[] = { */ SYSCALL_NULL_CTX_FAIL(kfunc_syscall_test_fail, -EINVAL, "processed 4 insns"), SYSCALL_NULL_CTX_FAIL(kfunc_syscall_test_null_fail, -EINVAL, "processed 4 insns"), + TC_FAIL(kfunc_call_test_get_mem_fail_rdonly, 0, "R0 cannot write into rdonly_mem"), + TC_FAIL(kfunc_call_test_get_mem_fail_use_after_free, 0, "invalid mem access 'scalar'"), + TC_FAIL(kfunc_call_test_get_mem_fail_oob, 0, "min value is outside of the allowed memory range"), + TC_FAIL(kfunc_call_test_get_mem_fail_not_const, 0, "is not a const"), + TC_FAIL(kfunc_call_test_mem_acquire_fail, 0, "acquire kernel function does not return PTR_TO_BTF_ID"), /* success cases */ TC_TEST(kfunc_call_test1, 12), TC_TEST(kfunc_call_test2, 3), TC_TEST(kfunc_call_test_ref_btf_id, 0), + TC_TEST(kfunc_call_test_get_mem, 42), SYSCALL_TEST(kfunc_syscall_test, 0), SYSCALL_NULL_CTX_TEST(kfunc_syscall_test_null, 0), }; diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c index 4168027f2ab1..b98313d391c6 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c @@ -3,7 +3,13 @@ #include #include +extern struct prog_test_ref_kfunc *bpf_kfunc_call_test_acquire(unsigned long *sp) __ksym; +extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym; extern void bpf_kfunc_call_test_mem_len_pass1(void *mem, int len) __ksym; +extern int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) __ksym; +extern int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym; +extern int 
*bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym; +extern void bpf_kfunc_call_int_mem_release(int *p) __ksym; struct syscall_test_args { __u8 data[16]; @@ -36,4 +42,119 @@ int kfunc_syscall_test_null_fail(struct syscall_test_args *args) return 0; } +SEC("?tc") +int kfunc_call_test_get_mem_fail_rdonly(struct __sk_buff *skb) +{ + struct prog_test_ref_kfunc *pt; + unsigned long s = 0; + int *p = NULL; + int ret = 0; + + pt = bpf_kfunc_call_test_acquire(&s); + if (pt) { + p = bpf_kfunc_call_test_get_rdonly_mem(pt, 2 * sizeof(int)); + if (p) + p[0] = 42; /* this is a read-only buffer, so -EACCES */ + else + ret = -1; + + bpf_kfunc_call_test_release(pt); + } + return ret; +} + +SEC("?tc") +int kfunc_call_test_get_mem_fail_use_after_free(struct __sk_buff *skb) +{ + struct prog_test_ref_kfunc *pt; + unsigned long s = 0; + int *p = NULL; + int ret = 0; + + pt = bpf_kfunc_call_test_acquire(&s); + if (pt) { + p = bpf_kfunc_call_test_get_rdwr_mem(pt, 2 * sizeof(int)); + if (p) { + p[0] = 42; + ret = p[1]; /* 108 */ + } else { + ret = -1; + } + + bpf_kfunc_call_test_release(pt); + } + if (p) + ret = p[0]; /* p is not valid anymore */ + + return ret; +} + +SEC("?tc") +int kfunc_call_test_get_mem_fail_oob(struct __sk_buff *skb) +{ + struct prog_test_ref_kfunc *pt; + unsigned long s = 0; + int *p = NULL; + int ret = 0; + + pt = bpf_kfunc_call_test_acquire(&s); + if (pt) { + p = bpf_kfunc_call_test_get_rdonly_mem(pt, 2 * sizeof(int)); + if (p) + ret = p[2 * sizeof(int)]; /* oob access, so -EACCES */ + else + ret = -1; + + bpf_kfunc_call_test_release(pt); + } + return ret; +} + +int not_const_size = 2 * sizeof(int); + +SEC("?tc") +int kfunc_call_test_get_mem_fail_not_const(struct __sk_buff *skb) +{ + struct prog_test_ref_kfunc *pt; + unsigned long s = 0; + int *p = NULL; + int ret = 0; + + pt = bpf_kfunc_call_test_acquire(&s); + if (pt) { + p = bpf_kfunc_call_test_get_rdonly_mem(pt, not_const_size); /* non const size, -EINVAL */ + if (p) + ret = p[0]; + else + ret = -1; + + bpf_kfunc_call_test_release(pt); + } + return ret; +} + +SEC("?tc") +int kfunc_call_test_mem_acquire_fail(struct __sk_buff *skb) +{ + struct prog_test_ref_kfunc *pt; + unsigned long s = 0; + int *p = NULL; + int ret = 0; + + pt = bpf_kfunc_call_test_acquire(&s); + if (pt) { + /* we are failing on this one, because we are not acquiring a PTR_TO_BTF_ID (a struct ptr) */ + p = bpf_kfunc_call_test_acq_rdonly_mem(pt, 2 * sizeof(int)); + if (p) + ret = p[0]; + else + ret = -1; + + bpf_kfunc_call_int_mem_release(p); + + bpf_kfunc_call_test_release(pt); + } + return ret; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c index 9e1914916f1d..f636e50be259 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_test.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c @@ -14,6 +14,8 @@ extern void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) __ksym; extern void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym; extern void bpf_kfunc_call_test_mem_len_pass1(void *mem, int len) __ksym; extern void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym; +extern int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) __ksym; +extern int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym; SEC("tc") int kfunc_call_test2(struct __sk_buff *skb) @@ -130,4 +132,35 @@ int 
kfunc_syscall_test_null(struct syscall_test_args *args) return 0; } +SEC("tc") +int kfunc_call_test_get_mem(struct __sk_buff *skb) +{ + struct prog_test_ref_kfunc *pt; + unsigned long s = 0; + int *p = NULL; + int ret = 0; + + pt = bpf_kfunc_call_test_acquire(&s); + if (pt) { + p = bpf_kfunc_call_test_get_rdwr_mem(pt, 2 * sizeof(int)); + if (p) { + p[0] = 42; + ret = p[1]; /* 108 */ + } else { + ret = -1; + } + + if (ret >= 0) { + p = bpf_kfunc_call_test_get_rdonly_mem(pt, 2 * sizeof(int)); + if (p) + ret = p[0]; /* 42 */ + else + ret = -1; + } + + bpf_kfunc_call_test_release(pt); + } + return ret; +} + char _license[] SEC("license") = "GPL"; From 9fad7fe5b29803584c7f17a2abe6c2936fec6828 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Wed, 7 Sep 2022 16:24:20 +0100 Subject: [PATCH 017/143] bpf: Fix resetting logic for unreferenced kptrs Sparse reported a warning at bpf_map_free_kptrs(): "warning: Using plain integer as NULL pointer". During the process of fixing this warning, it was discovered that the current code erroneously writes to the pointer variable instead of dereferencing and writing to the actual kptr. Hence, the Sparse tool accidentally helped uncover this problem. Fix this by doing WRITE_ONCE(*p, 0) instead of WRITE_ONCE(p, 0). Note that the effect of this bug is that unreferenced kptrs will not be cleared during check_and_free_fields. It is not a problem if the clearing is not done during the map_free stage, as there is nothing to free for them. Fixes: 14a324f6a67e ("bpf: Wire up freeing of referenced kptr") Signed-off-by: Jules Irenge Link: https://lore.kernel.org/r/Yxi3pJaK6UDjVJSy@playground Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4fb08c43420d..d35a6aa3aa96 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -598,7 +598,7 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) if (off_desc->type == BPF_KPTR_UNREF) { u64 *p = (u64 *)btf_id_ptr; - WRITE_ONCE(p, 0); + WRITE_ONCE(*p, 0); continue; } old_ptr = xchg(btf_id_ptr, 0); From 448325199f574d33824dbf9121efb03558412966 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:14 +0200 Subject: [PATCH 018/143] bpf: Add copy_map_value_long to copy to remote percpu memory bpf_long_memcpy is used while copying to remote percpu regions from BPF syscall and helpers, so that the copy is atomic at word size granularity. This might not be possible when copying a map value hosting kptrs from or to percpu maps, as the alignment or size of the disjoint regions may not be a multiple of the word size. Hence, to avoid complicating the copy loop, we only use bpf_long_memcpy when special fields are not present, and otherwise use normal memcpy to copy the disjoint regions. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 52 ++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 79883f883ff3..6a73e94821c4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -280,14 +280,33 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) } } -/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each.
*/ -static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and + * forced to use 'long' read/writes to try to atomically copy long counters. + * Best-effort only. No barriers here, since it _will_ race with concurrent + * updates from BPF programs. Called from bpf syscall and mostly used with + * size 8 or 16 bytes, so ask compiler to inline it. + */ +static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) +{ + const long *lsrc = src; + long *ldst = dst; + + size /= sizeof(long); + while (size--) + *ldst++ = *lsrc++; +} + +/* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ +static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy) { u32 curr_off = 0; int i; if (likely(!map->off_arr)) { - memcpy(dst, src, map->value_size); + if (long_memcpy) + bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); + else + memcpy(dst, src, map->value_size); return; } @@ -299,6 +318,17 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) } memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } + +static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +{ + __copy_map_value(map, dst, src, false); +} + +static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) +{ + __copy_map_value(map, dst, src, true); +} + void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); @@ -1827,22 +1857,6 @@ int bpf_get_file_flag(int flags); int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); -/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and - * forced to use 'long' read/writes to try to atomically copy long counters. - * Best-effort only. No barriers here, since it _will_ race with concurrent - * updates from BPF programs. Called from bpf syscall and mostly used with - * size 8 or 16 bytes, so ask compiler to inline it. - */ -static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) -{ - const long *lsrc = src; - long *ldst = dst; - - size /= sizeof(long); - while (size--) - *ldst++ = *lsrc++; -} - /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr); From 6df4ea1ff0ff70798ff1e7eed79f98ccb7b5b0a2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:15 +0200 Subject: [PATCH 019/143] bpf: Support kptrs in percpu arraymap Enable support for kptrs in percpu BPF arraymap by wiring up the freeing of these kptrs from percpu map elements. 
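For illustration, this is the kind of map layout a BPF program can now use (a sketch, assuming the usual vmlinux.h/bpf_helpers.h includes; prog_test_ref_kfunc and its acquire/release kfuncs are the test vehicles used elsewhere in this series, and __kptr_ref marks a referenced kptr field):

	extern struct prog_test_ref_kfunc *
	bpf_kfunc_call_test_acquire(unsigned long *sp) __ksym;
	extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym;

	struct map_value {
		struct prog_test_ref_kfunc __kptr_ref *ptr;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, struct map_value);
	} pcpu_array SEC(".maps");

	SEC("tc")
	int store_kptr(struct __sk_buff *ctx)
	{
		struct prog_test_ref_kfunc *p;
		struct map_value *v;
		unsigned long sp = 0;
		int key = 0;

		v = bpf_map_lookup_elem(&pcpu_array, &key);
		if (!v)
			return 0;
		p = bpf_kfunc_call_test_acquire(&sp);
		if (!p)
			return 0;
		/* swap the new reference in; release whatever was there */
		p = bpf_kptr_xchg(&v->ptr, p);
		if (p)
			bpf_kfunc_call_test_release(p);
		return 0;
	}

With this patch, freeing the percpu array also releases any reference still stored in v->ptr on every possible CPU.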
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 33 ++++++++++++++++++++++++--------- kernel/bpf/syscall.c | 3 ++- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 624527401d4d..832b2659e96e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -279,7 +279,8 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value + off); off += size; } rcu_read_unlock(); @@ -338,8 +339,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), - value, map->value_size); + val = this_cpu_ptr(array->pptrs[index & array->index_mask]); + copy_map_value(map, val, value); + check_and_free_fields(array, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -383,7 +385,8 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); + check_and_free_fields(array, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); @@ -421,8 +424,20 @@ static void array_map_free(struct bpf_map *map) int i; if (map_value_has_kptrs(map)) { - for (i = 0; i < array->map.max_entries; i++) - bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + for (i = 0; i < array->map.max_entries; i++) { + void __percpu *pptr = array->pptrs[i & array->index_mask]; + int cpu; + + for_each_possible_cpu(cpu) { + bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu)); + cond_resched(); + } + } + } else { + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); + } bpf_map_free_kptr_off_tab(map); } @@ -608,9 +623,9 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) pptr = v; size = array->elem_size; for_each_possible_cpu(cpu) { - bpf_long_memcpy(info->percpu_value_buf + off, - per_cpu_ptr(pptr, cpu), - size); + copy_map_value_long(map, info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, info->percpu_value_buf + off); off += size; } ctx.value = info->percpu_value_buf; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d35a6aa3aa96..69be1c612daa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1049,7 +1049,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, } if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) { + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { ret = -EOPNOTSUPP; goto free_map_tab; } From cc48755808c646666436745b35629c3f0d05e165 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:16 +0200 Subject: [PATCH 020/143] bpf: Add zero_map_value to zero map value with special fields We need this helper to 
skip over special fields (bpf_spin_lock, bpf_timer, kptrs) while zeroing a map value. Use the same logic as copy_map_value, but with memset instead of memcpy. Currently, the code zeroing map value memory does not have to deal with special fields, hence this is a prerequisite for introducing such support. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 +++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6a73e94821c4..48ae05099f36 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -329,6 +329,25 @@ static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src __copy_map_value(map, dst, src, true); } +static inline void zero_map_value(struct bpf_map *map, void *dst) +{ + u32 curr_off = 0; + int i; + + if (likely(!map->off_arr)) { + memset(dst, 0, map->value_size); + return; + } + + for (i = 0; i < map->off_arr->cnt; i++) { + u32 next_off = map->off_arr->field_off[i]; + + memset(dst + curr_off, 0, next_off - curr_off); + curr_off = next_off + map->off_arr->field_sz[i]; + } + memset(dst + curr_off, 0, map->value_size - curr_off); +} + void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); From b239da34203f49c40b5d656220c39647c3ff0b3c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:28 +0200 Subject: [PATCH 021/143] bpf: Add helper macro bpf_for_each_reg_in_vstate For a lot of use cases in future patches, we will want to modify the state of registers that are part of the same 'group' (e.g. same ref_obj_id). It won't just be limited to releasing reference state, but also setting a type flag dynamically based on certain actions, etc. Hence, we need a way to easily pass a callback to the function that iterates over all registers in the current bpf_verifier_state, in all frames up to (and including) the curframe. While in C++ we would be able to easily use a lambda to pass state and the callback together, sadly we aren't using C++ in the kernel. The next best thing, to avoid defining a function for each case, seems to be statement expressions in GNU C. The kernel already uses them heavily, hence they can be passed to the macro in the style of a lambda. The statement expression will then be substituted in the for loop bodies. The variables __state and __reg are set to the current bpf_func_state and reg for each invocation of the expression inside the passed-in verifier state. Then, convert mark_ptr_or_null_regs, clear_all_pkt_pointers, release_reference, find_good_pkt_pointers, find_equal_scalars to use bpf_for_each_reg_in_vstate.
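To illustrate the statement-expression idiom the macro builds on (a generic GNU C sketch, not kernel code; for_each_val is a made-up name): the expression argument is pasted by the preprocessor into the loop body, where it can refer to the variable the macro binds, much like a lambda capturing by name:

	#include <stdio.h>

	/* run __expr once per element, with __val bound to the element */
	#define for_each_val(__arr, __n, __val, __expr)			\
		({							\
			int ___i;					\
			for (___i = 0; ___i < (__n); ___i++) {		\
				__val = (__arr)[___i];			\
				(void)(__expr);				\
			}						\
		})

	int main(void)
	{
		int a[] = { 1, 2, 3, 4 }, v, sum = 0;

		for_each_val(a, 4, v, ({ sum += v; }));
		printf("sum = %d\n", sum);	/* sum = 10 */
		return 0;
	}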
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-16-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 21 ++++++ kernel/bpf/verifier.c | 137 ++++++--------------------- 2 files changed, 50 insertions(+), 108 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8fbc1d05281e..b49a349cc6ae 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -348,6 +348,27 @@ struct bpf_verifier_state { iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame)) +/* Invoke __expr over registers in __vst, setting __state and __reg */ +#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ + ({ \ + struct bpf_verifier_state *___vstate = __vst; \ + int ___i, ___j; \ + for (___i = 0; ___i <= ___vstate->curframe; ___i++) { \ + struct bpf_reg_state *___regs; \ + __state = ___vstate->frame[___i]; \ + ___regs = __state->regs; \ + for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ + __reg = &___regs[___j]; \ + (void)(__expr); \ + } \ + bpf_for_each_spilled_reg(___j, __state, __reg) { \ + if (!__reg) \ + continue; \ + (void)(__expr); \ + } \ + } \ + }) + /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3344a86d88d..c0f175ac187a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6513,31 +6513,15 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id) /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. */ -static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, - struct bpf_func_state *state) -{ - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (reg_is_pkt_pointer_any(&regs[i])) - mark_reg_unknown(env, regs, i); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg_is_pkt_pointer_any(reg)) - __mark_reg_unknown(env, reg); - } -} - static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_verifier_state *vstate = env->cur_state; - int i; + struct bpf_func_state *state; + struct bpf_reg_state *reg; - for (i = 0; i <= vstate->curframe; i++) - __clear_all_pkt_pointers(env, vstate->frame[i]); + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg_is_pkt_pointer_any(reg)) + __mark_reg_unknown(env, reg); + })); } enum { @@ -6566,41 +6550,24 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range reg->range = AT_PKT_END; } -static void release_reg_references(struct bpf_verifier_env *env, - struct bpf_func_state *state, - int ref_obj_id) -{ - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].ref_obj_id == ref_obj_id) - mark_reg_unknown(env, regs, i); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->ref_obj_id == ref_obj_id) - __mark_reg_unknown(env, reg); - } -} - /* The pointer with the specified id has released its reference to kernel * resources. Identify all copies of the same pointer and clear the reference.
*/ static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) { - struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state; + struct bpf_reg_state *reg; int err; - int i; err = release_reference_state(cur_func(env), ref_obj_id); if (err) return err; - for (i = 0; i <= vstate->curframe; i++) - release_reg_references(env, vstate->frame[i], ref_obj_id); + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) + __mark_reg_unknown(env, reg); + })); return 0; } @@ -9335,34 +9302,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void __find_good_pkt_pointers(struct bpf_func_state *state, - struct bpf_reg_state *dst_reg, - enum bpf_reg_type type, int new_range) -{ - struct bpf_reg_state *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) { - reg = &state->regs[i]; - if (reg->type == type && reg->id == dst_reg->id) - /* keep the maximum range already checked */ - reg->range = max(reg->range, new_range); - } - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); - } -} - static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { - int new_range, i; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int new_range; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -9427,9 +9374,11 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * the range won't allow anything. * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. */ - for (i = 0; i <= vstate->curframe; i++) - __find_good_pkt_pointers(vstate->frame[i], dst_reg, type, - new_range); + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + if (reg->type == type && reg->id == dst_reg->id) + /* keep the maximum range already checked */ + reg->range = max(reg->range, new_range); + })); } static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) @@ -9918,7 +9867,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset - * in release_reg_references(). + * in release_reference(). * * reg->id is still used by spin_lock ptr. Other * than spin_lock ptr type, reg->id can be reset. @@ -9928,22 +9877,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } -static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, - bool is_null) -{ - struct bpf_reg_state *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - mark_ptr_or_null_reg(state, reg, id, is_null); - } -} - /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ @@ -9951,10 +9884,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs = state->regs, *reg; u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - int i; if (ref_obj_id && ref_obj_id == id && is_null) /* regs[regno] is in the " == NULL" branch. 
@@ -9963,8 +9895,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, */ WARN_ON_ONCE(release_reference_state(state, id)); - for (i = 0; i <= vstate->curframe; i++) - __mark_ptr_or_null_regs(vstate->frame[i], id, is_null); + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + mark_ptr_or_null_reg(state, reg, id, is_null); + })); } static bool try_match_pkt_pointers(const struct bpf_insn *insn, @@ -10077,23 +10010,11 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, { struct bpf_func_state *state; struct bpf_reg_state *reg; - int i, j; - for (i = 0; i <= vstate->curframe; i++) { - state = vstate->frame[i]; - for (j = 0; j < MAX_BPF_REG; j++) { - reg = &state->regs[j]; - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; - } - - bpf_for_each_spilled_reg(j, state, reg) { - if (!reg) - continue; - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; - } - } + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + })); } static int check_cond_jmp_op(struct bpf_verifier_env *env, From 665f5d3577ef43e929d59cf39683037887c351bf Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2022 16:15:11 +0200 Subject: [PATCH 022/143] libbpf: Remove gcc support for bpf_tail_call_static for now This reverts commit 14e5ce79943a ("libbpf: Add GCC support for bpf_tail_call_static"). The reason is that GCC invented its own BPF asm syntax, which does not conform to the LLVM one, and going forward this would be more painful to maintain here and in other areas of the library. Thus remove it; the ask to the GCC folks is to align with the LLVM syntax and use exactly the same one. Fixes: 14e5ce79943a ("libbpf: Add GCC support for bpf_tail_call_static") Signed-off-by: Daniel Borkmann Cc: James Hilliard Cc: Jose E. Marchesi --- tools/lib/bpf/bpf_helpers.h | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 867b734839dd..7349b16b8e2f 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -131,7 +131,7 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. */ -#if (!defined(__clang__) || __clang_major__ >= 8) && defined(__bpf__) +#if __clang_major__ >= 8 && defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -139,8 +139,8 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) __bpf_unreachable(); /* - * Provide a hard guarantee that the compiler won't optimize setting r2 - * (map pointer) and r3 (constant map index) from _different paths_ ending + * Provide a hard guarantee that LLVM won't optimize setting r2 (map + * pointer) and r3 (constant map index) from _different paths_ ending * up at the _same_ call insn as otherwise we won't be able to use the * jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel * given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key * tracking for prog array pokes") for details. * * Note on clobber list: we need to stay in-line with BPF calling * convention, so even if we don't end up using r0, r4, r5, we need - * to mark them as clobber so that the compiler doesn't end up using - * them before / after the call. + * to mark them as clobber so that LLVM doesn't end up using them + * before / after the call.
*/ - asm volatile( -#ifdef __clang__ - "r1 = %[ctx]\n\t" + asm volatile("r1 = %[ctx]\n\t" "r2 = %[map]\n\t" "r3 = %[slot]\n\t" -#else - "mov %%r1,%[ctx]\n\t" - "mov %%r2,%[map]\n\t" - "mov %%r3,%[slot]\n\t" -#endif "call 12" :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot) : "r0", "r1", "r2", "r3", "r4", "r5"); From 0ffe2412531e95a309d7f0bfe985fc4ca4d39de8 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Fri, 9 Sep 2022 00:49:39 +0000 Subject: [PATCH 023/143] bpf: Invoke cgroup/connect{4,6} programs for unprivileged ICMP ping Usually when a TCP/UDP connection is initiated, we can bind the socket to a specific IP attached to an interface in a cgroup/connect hook. But for pings, this is impossible, as the hook is not being called. This adds the hook invocation to unprivileged ICMP ping (i.e. ping sockets created with SOCK_DGRAM IPPROTO_ICMP(V6) as opposed to SOCK_RAW). Logic is mirrored from UDP sockets where the hook is invoked during pre_connect, after a check for sufficiently sized addr_len. Signed-off-by: YiFei Zhu Link: https://lore.kernel.org/r/5764914c252fad4cd134fb6664c6ede95f409412.1662682323.git.zhuyifei@google.com Signed-off-by: Martin KaFai Lau --- net/ipv4/ping.c | 15 +++++++++++++++ net/ipv6/ping.c | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b83c2bd9d722..517042caf6dc 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -33,6 +33,7 @@ #include #include #include +#include <linux/bpf-cgroup.h> #include #include #include @@ -295,6 +296,19 @@ void ping_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(ping_close); +static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from __ip4_datagram_connect() and + * intended to prevent the BPF program called below from accessing + * bytes that are out of the bounds specified by the user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); +} + /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, struct sockaddr *uaddr, int addr_len) @@ -1009,6 +1023,7 @@ struct proto ping_prot = { .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, + .pre_connect = ping_pre_connect, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .setsockopt = ip_setsockopt, diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 91b840514656..5f2ef8493714 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -20,6 +20,7 @@ #include #include #include +#include <linux/bpf-cgroup.h> #include static void ping_v6_destroy(struct sock *sk) @@ -49,6 +50,20 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, return 0; } +static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from __ip6_datagram_connect() and + * intended to prevent the BPF program called below from accessing + * bytes that are out of the bounds specified by the user in addr_len.
+ */ + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); +} + static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); @@ -191,6 +206,7 @@ struct proto pingv6_prot = { .init = ping_init_sock, .close = ping_close, .destroy = ping_v6_destroy, + .pre_connect = ping_v6_pre_connect, .connect = ip6_datagram_connect_v6_only, .disconnect = __udp_disconnect, .setsockopt = ipv6_setsockopt, From e42921c3c346b1b49068af3f3881322081e1dddd Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Fri, 9 Sep 2022 00:49:40 +0000 Subject: [PATCH 024/143] selftests/bpf: Deduplicate write_sysctl() to test_progs.c This helper is needed in multiple tests. Instead of copying it over and over, better to deduplicate this helper to test_progs.c. test_progs.c is chosen over testing_helpers.c because of this helper's use of CHECK / ASSERT_*, and the CHECK was modified to use ASSERT_* so it does not rely on a duration variable. Suggested-by: Martin KaFai Lau Signed-off-by: YiFei Zhu Link: https://lore.kernel.org/r/9b4fc9a27bd52f771b657b4c4090fc8d61f3a6b5.1662682323.git.zhuyifei@google.com Signed-off-by: Martin KaFai Lau --- .../bpf/prog_tests/btf_skc_cls_ingress.c | 20 ------------------- .../bpf/prog_tests/tcp_hdr_options.c | 20 ------------------- tools/testing/selftests/bpf/test_progs.c | 17 ++++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 1 + 4 files changed, 18 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c index 664ffc0364f4..7a277035c275 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c @@ -22,26 +22,6 @@ static __u32 duration; #define PROG_PIN_FILE "/sys/fs/bpf/btf_skc_cls_ingress" -static int write_sysctl(const char *sysctl, const char *value) -{ - int fd, err, len; - - fd = open(sysctl, O_WRONLY); - if (CHECK(fd == -1, "open sysctl", "open(%s): %s (%d)\n", - sysctl, strerror(errno), errno)) - return -1; - - len = strlen(value); - err = write(fd, value, len); - close(fd); - if (CHECK(err != len, "write sysctl", - "write(%s, %s, %d): err:%d %s (%d)\n", - sysctl, value, len, err, strerror(errno), errno)) - return -1; - - return 0; -} - static int prepare_netns(void) { if (CHECK(unshare(CLONE_NEWNET), "create netns", diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c index 1fa772079967..f24436d33cd6 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c @@ -54,26 +54,6 @@ static int create_netns(void) return 0; } -static int write_sysctl(const char *sysctl, const char *value) -{ - int fd, err, len; - - fd = open(sysctl, O_WRONLY); - if (CHECK(fd == -1, "open sysctl", "open(%s): %s (%d)\n", - sysctl, strerror(errno), errno)) - return -1; - - len = strlen(value); - err = write(fd, value, len); - close(fd); - if (CHECK(err != len, "write sysctl", - "write(%s, %s): err:%d %s (%d)\n", - sysctl, value, err, strerror(errno), errno)) - return -1; - - return 0; -} - static void print_hdr_stg(const struct hdr_stg *hdr_stg, const char *prefix) { fprintf(stderr, "%s{active:%u, resend_syn:%u, syncookie:%u, fastopen:%u}\n", diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 
3561c97701f2..0e9a47f97890 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -943,6 +943,23 @@ int trigger_module_test_write(int write_sz) return 0; } +int write_sysctl(const char *sysctl, const char *value) +{ + int fd, err, len; + + fd = open(sysctl, O_WRONLY); + if (!ASSERT_NEQ(fd, -1, "open sysctl")) + return -1; + + len = strlen(value); + err = write(fd, value, len); + close(fd); + if (!ASSERT_EQ(err, len, "write sysctl")) + return -1; + + return 0; +} + #define MAX_BACKTRACE_SZ 128 void crash_handler(int signum) { diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 5fe1365c2bb1..b090996daee5 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -384,6 +384,7 @@ int extract_build_id(char *build_id, size_t size); int kern_sync_rcu(void); int trigger_module_test_read(int read_sz); int trigger_module_test_write(int write_sz); +int write_sysctl(const char *sysctl, const char *value); #ifdef __x86_64__ #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" From 58c449a96946929467b537589c8a23f11e04af39 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Fri, 9 Sep 2022 00:49:41 +0000 Subject: [PATCH 025/143] selftests/bpf: Ensure cgroup/connect{4,6} programs can bind unpriv ICMP ping This tests that when an unprivileged ICMP ping socket connects, the hooks are actually invoked. We also ensure that if the hook does not call bpf_bind(), the bound address is unmodified, and if the hook calls bpf_bind(), the bound address is exactly what we provided to the helper. A new netns is used to enable ping_group_range in the test without affecting anything outside of the test, because by default, not even root is permitted to use unprivileged ICMP ping... Signed-off-by: YiFei Zhu Link: https://lore.kernel.org/r/086b227c1b97f4e94193e58aae7576d0261b68a4.1662682323.git.zhuyifei@google.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/connect_ping.c | 178 ++++++++++++++++++ .../selftests/bpf/progs/connect_ping.c | 53 ++++++ 2 files changed, 231 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/connect_ping.c create mode 100644 tools/testing/selftests/bpf/progs/connect_ping.c diff --git a/tools/testing/selftests/bpf/prog_tests/connect_ping.c b/tools/testing/selftests/bpf/prog_tests/connect_ping.c new file mode 100644 index 000000000000..289218c2216c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/connect_ping.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright 2022 Google LLC.
+ */ + +#define _GNU_SOURCE +#include + +#include "test_progs.h" +#include "cgroup_helpers.h" +#include "network_helpers.h" + +#include "connect_ping.skel.h" + +/* 2001:db8::1 */ +#define BINDADDR_V6 { { { 0x20,0x01,0x0d,0xb8,0,0,0,0,0,0,0,0,0,0,0,1 } } } +static const struct in6_addr bindaddr_v6 = BINDADDR_V6; + +static void subtest(int cgroup_fd, struct connect_ping *skel, + int family, int do_bind) +{ + struct sockaddr_in sa4 = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; + struct sockaddr_in6 sa6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; + struct sockaddr *sa; + socklen_t sa_len; + int protocol; + int sock_fd; + + switch (family) { + case AF_INET: + sa = (struct sockaddr *)&sa4; + sa_len = sizeof(sa4); + protocol = IPPROTO_ICMP; + break; + case AF_INET6: + sa = (struct sockaddr *)&sa6; + sa_len = sizeof(sa6); + protocol = IPPROTO_ICMPV6; + break; + } + + memset(skel->bss, 0, sizeof(*skel->bss)); + skel->bss->do_bind = do_bind; + + sock_fd = socket(family, SOCK_DGRAM, protocol); + if (!ASSERT_GE(sock_fd, 0, "sock-create")) + return; + + if (!ASSERT_OK(connect(sock_fd, sa, sa_len), "connect")) + goto close_sock; + + if (!ASSERT_EQ(skel->bss->invocations_v4, family == AF_INET ? 1 : 0, + "invocations_v4")) + goto close_sock; + if (!ASSERT_EQ(skel->bss->invocations_v6, family == AF_INET6 ? 1 : 0, + "invocations_v6")) + goto close_sock; + if (!ASSERT_EQ(skel->bss->has_error, 0, "has_error")) + goto close_sock; + + if (!ASSERT_OK(getsockname(sock_fd, sa, &sa_len), + "getsockname")) + goto close_sock; + + switch (family) { + case AF_INET: + if (!ASSERT_EQ(sa4.sin_family, family, "sin_family")) + goto close_sock; + if (!ASSERT_EQ(sa4.sin_addr.s_addr, + htonl(do_bind ? 0x01010101 : INADDR_LOOPBACK), + "sin_addr")) + goto close_sock; + break; + case AF_INET6: + if (!ASSERT_EQ(sa6.sin6_family, AF_INET6, "sin6_family")) + goto close_sock; + if (!ASSERT_EQ(memcmp(&sa6.sin6_addr, + do_bind ? &bindaddr_v6 : &in6addr_loopback, + sizeof(sa6.sin6_addr)), + 0, "sin6_addr")) + goto close_sock; + break; + } + +close_sock: + close(sock_fd); +} + +void test_connect_ping(void) +{ + struct connect_ping *skel; + int cgroup_fd; + + if (!ASSERT_OK(unshare(CLONE_NEWNET | CLONE_NEWNS), "unshare")) + return; + + /* overmount sysfs, and making original sysfs private so overmount + * does not propagate to other mntns. 
+ */ + if (!ASSERT_OK(mount("none", "/sys", NULL, MS_PRIVATE, NULL), + "remount-private-sys")) + return; + if (!ASSERT_OK(mount("sysfs", "/sys", "sysfs", 0, NULL), + "mount-sys")) + return; + if (!ASSERT_OK(mount("bpffs", "/sys/fs/bpf", "bpf", 0, NULL), + "mount-bpf")) + goto clean_mount; + + if (!ASSERT_OK(system("ip link set dev lo up"), "lo-up")) + goto clean_mount; + if (!ASSERT_OK(system("ip addr add 1.1.1.1 dev lo"), "lo-addr-v4")) + goto clean_mount; + if (!ASSERT_OK(system("ip -6 addr add 2001:db8::1 dev lo"), "lo-addr-v6")) + goto clean_mount; + if (write_sysctl("/proc/sys/net/ipv4/ping_group_range", "0 0")) + goto clean_mount; + + cgroup_fd = test__join_cgroup("/connect_ping"); + if (!ASSERT_GE(cgroup_fd, 0, "cg-create")) + goto clean_mount; + + skel = connect_ping__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel-load")) + goto close_cgroup; + skel->links.connect_v4_prog = + bpf_program__attach_cgroup(skel->progs.connect_v4_prog, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.connect_v4_prog, "cg-attach-v4")) + goto skel_destroy; + skel->links.connect_v6_prog = + bpf_program__attach_cgroup(skel->progs.connect_v6_prog, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.connect_v6_prog, "cg-attach-v6")) + goto skel_destroy; + + /* Connect a v4 ping socket to localhost, assert that only v4 is called, + * and called exactly once, and that the socket's bound address is + * original loopback address. + */ + if (test__start_subtest("ipv4")) + subtest(cgroup_fd, skel, AF_INET, 0); + + /* Connect a v4 ping socket to localhost, assert that only v4 is called, + * and called exactly once, and that the socket's bound address is + * address we explicitly bound. + */ + if (test__start_subtest("ipv4-bind")) + subtest(cgroup_fd, skel, AF_INET, 1); + + /* Connect a v6 ping socket to localhost, assert that only v6 is called, + * and called exactly once, and that the socket's bound address is + * original loopback address. + */ + if (test__start_subtest("ipv6")) + subtest(cgroup_fd, skel, AF_INET6, 0); + + /* Connect a v6 ping socket to localhost, assert that only v6 is called, + * and called exactly once, and that the socket's bound address is + * address we explicitly bound. + */ + if (test__start_subtest("ipv6-bind")) + subtest(cgroup_fd, skel, AF_INET6, 1); + +skel_destroy: + connect_ping__destroy(skel); + +close_cgroup: + close(cgroup_fd); + +clean_mount: + umount2("/sys", MNT_DETACH); +} diff --git a/tools/testing/selftests/bpf/progs/connect_ping.c b/tools/testing/selftests/bpf/progs/connect_ping.c new file mode 100644 index 000000000000..60178192b672 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_ping.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright 2022 Google LLC. 
+ */ + +#include +#include +#include +#include +#include + +/* 2001:db8::1 */ +#define BINDADDR_V6 { { { 0x20,0x01,0x0d,0xb8,0,0,0,0,0,0,0,0,0,0,0,1 } } } + +__u32 do_bind = 0; +__u32 has_error = 0; +__u32 invocations_v4 = 0; +__u32 invocations_v6 = 0; + +SEC("cgroup/connect4") +int connect_v4_prog(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = { + .sin_family = AF_INET, + .sin_addr.s_addr = bpf_htonl(0x01010101), + }; + + __sync_fetch_and_add(&invocations_v4, 1); + + if (do_bind && bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa))) + has_error = 1; + + return 1; +} + +SEC("cgroup/connect6") +int connect_v6_prog(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in6 sa = { + .sin6_family = AF_INET6, + .sin6_addr = BINDADDR_V6, + }; + + __sync_fetch_and_add(&invocations_v6, 1); + + if (do_bind && bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa))) + has_error = 1; + + return 1; +} + +char _license[] SEC("license") = "GPL"; From cf7de6a53600ea554a8358e44fbcf47b449235f9 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 11 Sep 2022 00:07:11 +0900 Subject: [PATCH 026/143] bpf: add missing percpu_counter_destroy() in htab_map_alloc() syzbot is reporting ODEBUG bug in htab_map_alloc() [1], for commit 86fe28f7692d96d2 ("bpf: Optimize element count in non-preallocated hash map.") added percpu_counter_init() to htab_map_alloc() but forgot to add percpu_counter_destroy() to the error path. Link: https://syzkaller.appspot.com/bug?extid=5d1da78b375c3b5e6c2b [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Fixes: 86fe28f7692d96d2 ("bpf: Optimize element count in non-preallocated hash map.") Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/e2e4cc0e-9d36-4ca1-9bfa-ce23e6f8310b@I-love.SAKURA.ne.jp Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0fe3f136cbbe..86aec20c22d0 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -622,6 +622,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_prealloc: prealloc_destroy(htab); free_map_locked: + if (htab->use_percpu_counter) + percpu_counter_destroy(&htab->pcount); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); From 57c92f11a215717bf90880828b7a23c736c3c0d9 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Wed, 7 Sep 2022 16:57:46 +0100 Subject: [PATCH 027/143] bpf: Simplify code by using for_each_cpu_wrap() In the percpu freelist code, it is a common pattern to iterate over the possible CPUs mask starting with the current CPU. The pattern is implemented using a hand rolled while loop with the loop variable increment being open-coded. Simplify the code by using for_each_cpu_wrap() helper to iterate over the possible cpus starting with the current CPU. As a result, some of the special-casing in the loop also gets simplified. No functional change intended. 
Signed-off-by: Punit Agrawal Acked-by: Song Liu Link: https://lore.kernel.org/r/20220907155746.1750329-1-punit.agrawal@bytedance.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/percpu_freelist.c | 48 ++++++++++++------------------------ 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 00b874c8e889..b6e7f5c5b9ab 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -58,23 +58,21 @@ static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, { int cpu, orig_cpu; - orig_cpu = cpu = raw_smp_processor_id(); + orig_cpu = raw_smp_processor_id(); while (1) { - struct pcpu_freelist_head *head; + for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { + struct pcpu_freelist_head *head; - head = per_cpu_ptr(s->freelist, cpu); - if (raw_spin_trylock(&head->lock)) { - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); - return; + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + pcpu_freelist_push_node(head, node); + raw_spin_unlock(&head->lock); + return; + } } - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; /* cannot lock any per cpu lock, try extralist */ - if (cpu == orig_cpu && - pcpu_freelist_try_push_extra(s, node)) + if (pcpu_freelist_try_push_extra(s, node)) return; } } @@ -125,13 +123,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - int orig_cpu, cpu; + int cpu; - orig_cpu = cpu = raw_smp_processor_id(); - while (1) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) - goto next_cpu; + continue; raw_spin_lock(&head->lock); node = head->first; if (node) { @@ -140,12 +137,6 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) return node; } raw_spin_unlock(&head->lock); -next_cpu: - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; - if (cpu == orig_cpu) - break; } /* per cpu lists are all empty, try extralist */ @@ -164,13 +155,12 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - int orig_cpu, cpu; + int cpu; - orig_cpu = cpu = raw_smp_processor_id(); - while (1) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) - goto next_cpu; + continue; if (raw_spin_trylock(&head->lock)) { node = head->first; if (node) { @@ -180,12 +170,6 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) } raw_spin_unlock(&head->lock); } -next_cpu: - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; - if (cpu == orig_cpu) - break; } /* cannot pop from per cpu lists, try extralist */ From 65269888c695cf4643c6fdb989ea28bf1623685d Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 7 Sep 2022 10:40:36 -0600 Subject: [PATCH 028/143] bpf: Remove duplicate PTR_TO_BTF_ID RO check Since commit 27ae7997a661 ("bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS") there has existed bpf_verifier_ops:btf_struct_access. When btf_struct_access is _unset_ for a prog type, the verifier runs the default implementation, which is to enforce read only: if (env->ops->btf_struct_access) { [...] } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); return -EACCES; } [...] 
}

When btf_struct_access is _set_, the expectation is that btf_struct_access has full control over accesses, including if writes are allowed. Rather than carve out an exception for each prog type that may write to BTF ptrs, delete the redundant check and give full control to btf_struct_access.

Signed-off-by: Daniel Xu
Acked-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/962da2bff1238746589e332ff1aecc49403cd7ce.1662568410.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c0f175ac187a..c3efd461f36c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13406,9 +13406,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 				insn->code = BPF_LDX | BPF_PROBE_MEM |
 					BPF_SIZE((insn)->code);
 				env->prog->aux->num_exentries++;
-			} else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
-				verbose(env, "Writes through BTF pointers are not allowed\n");
-				return -EINVAL;
 			}
 			continue;
 		default:
From d4f7bdb2ed7bf320a8772258fdc257655d225afb Mon Sep 17 00:00:00 2001
From: Daniel Xu
Date: Wed, 7 Sep 2022 10:40:37 -0600
Subject: [PATCH 029/143] bpf: Add stub for btf_struct_access()

Add a corresponding unimplemented stub for when CONFIG_BPF_SYSCALL=n.

Signed-off-by: Daniel Xu
Acked-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/4021398e884433b1fef57a4d28361bb9fcf1bd05.1662568410.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 48ae05099f36..54178b9e9c3a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2211,6 +2211,15 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id)
 	return ERR_PTR(-ENOTSUPP);
 }

+static inline int btf_struct_access(struct bpf_verifier_log *log,
+				    const struct btf *btf,
+				    const struct btf_type *t, int off, int size,
+				    enum bpf_access_type atype,
+				    u32 *next_btf_id, enum bpf_type_flag *flag)
+{
+	return -EACCES;
+}
+
 static inline const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
From 896f07c07da01aa7cee820a23c2bce1d8e9fe1e6 Mon Sep 17 00:00:00 2001
From: Daniel Xu
Date: Wed, 7 Sep 2022 10:40:38 -0600
Subject: [PATCH 030/143] bpf: Use 0 instead of NOT_INIT for btf_struct_access() writes

Returning a bpf_reg_type only makes sense in the context of a BPF_READ. For writes, prefer to explicitly return 0 for clarity. Note that this is a non-functional change, as it just so happens that NOT_INIT == 0.

Signed-off-by: Daniel Xu
Link: https://lore.kernel.org/r/01772bc1455ae16600796ac78c6cc9fff34f95ff.1662568410.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov
---
 net/ipv4/bpf_tcp_ca.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 85a9e500c42d..6da16ae6a962 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -124,7 +124,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
 		return -EACCES;
 	}

-	return NOT_INIT;
+	return 0;
 }

 BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt)
From 84c6ac417ceacd086efc330afece8922969610b7 Mon Sep 17 00:00:00 2001
From: Daniel Xu
Date: Wed, 7 Sep 2022 10:40:39 -0600
Subject: [PATCH 031/143] bpf: Export btf_type_by_id() and bpf_log()

These symbols will be used in nf_conntrack.ko to support direct writes to `nf_conn`.
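To make the intended use concrete, here is a rough sketch of a module-side caller (along the lines of the nf_conntrack btf_struct_access callback added later in this series); 'my_btf_id' stands in for an id resolved via BTF_ID_LIST and is not part of this patch:

	const struct btf_type *t;

	t = btf_type_by_id(btf, my_btf_id);	/* now callable from modules */
	if (!t)
		bpf_log(log, "cannot resolve BTF type id %u\n", my_btf_id);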
Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/3c98c19dc50d3b18ea5eca135b4fc3a5db036060.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 1 + kernel/bpf/verifier.c | 1 + 2 files changed, 2 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9d12212fcd61..98be25d13325 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -818,6 +818,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) return NULL; return btf->types[type_id]; } +EXPORT_SYMBOL_GPL(btf_type_by_id); /* * Regular int is not a bit field and it must be either diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c3efd461f36c..9109e07b759a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -370,6 +370,7 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, bpf_verifier_vlog(log, fmt, args); va_end(args); } +EXPORT_SYMBOL_GPL(bpf_log); static const char *ltrim(const char *s) { From 864b656f82ccd433d3e38149c3673d295ad64bf6 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 7 Sep 2022 10:40:40 -0600 Subject: [PATCH 032/143] bpf: Add support for writing to nf_conn:mark Support direct writes to nf_conn:mark from TC and XDP prog types. This is useful when applications want to store per-connection metadata. This is also particularly useful for applications that run both bpf and iptables/nftables because the latter can trivially access this metadata. One example use case would be if a bpf prog is responsible for advanced packet classification and iptables/nftables is later used for routing due to pre-existing/legacy code. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/ebca06dea366e3e7e861c12f375a548cc4c61108.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/net/netfilter/nf_conntrack_bpf.h | 23 +++++++++ net/core/filter.c | 54 +++++++++++++++++++ net/netfilter/nf_conntrack_bpf.c | 66 +++++++++++++++++++++++- net/netfilter/nf_conntrack_core.c | 1 + 4 files changed, 143 insertions(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index a473b56842c5..a61a93d1c6dc 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -3,13 +3,22 @@ #ifndef _NF_CONNTRACK_BPF_H #define _NF_CONNTRACK_BPF_H +#include #include #include +#include #if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) extern int register_nf_conntrack_bpf(void); +extern void cleanup_nf_conntrack_bpf(void); + +extern struct mutex nf_conn_btf_access_lock; +extern int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); #else @@ -18,6 +27,20 @@ static inline int register_nf_conntrack_bpf(void) return 0; } +static inline void cleanup_nf_conntrack_bpf(void) +{ +} + +static inline int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + return -EACCES; +} + #endif #endif /* _NF_CONNTRACK_BPF_H */ diff --git a/net/core/filter.c b/net/core/filter.c index e872f45399b0..4b2be211bcbe 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -8604,6 +8605,36 @@ 
static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } +DEFINE_MUTEX(nf_conn_btf_access_lock); +EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); + +int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); +EXPORT_SYMBOL_GPL(nfct_bsa); + +static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_bsa) + ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -8663,6 +8694,27 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static int xdp_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_bsa) + ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -10557,6 +10609,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, + .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -10568,6 +10621,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, + .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 1cd87b28c9b0..77eb8e959f61 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -6,8 +6,10 @@ * are exposed through to BPF programs is explicitly unstable. 
*/ +#include #include #include +#include #include #include #include @@ -184,6 +186,54 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, return ct; } +BTF_ID_LIST(btf_nf_conn_ids) +BTF_ID(struct, nf_conn) +BTF_ID(struct, nf_conn___init) + +/* Check writes into `struct nf_conn` */ +static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + const struct btf_type *ncit; + const struct btf_type *nct; + size_t end; + + ncit = btf_type_by_id(btf, btf_nf_conn_ids[1]); + nct = btf_type_by_id(btf, btf_nf_conn_ids[0]); + + if (t != nct && t != ncit) { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + /* `struct nf_conn` and `struct nf_conn___init` have the same layout + * so we are safe to simply merge offset checks here + */ + switch (off) { +#if defined(CONFIG_NF_CONNTRACK_MARK) + case offsetof(struct nf_conn, mark): + end = offsetofend(struct nf_conn, mark); + break; +#endif + default: + bpf_log(log, "no write support to nf_conn at off %d\n", off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n", + off, size, end); + return -EACCES; + } + + return 0; +} + __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in nf_conntrack BTF"); @@ -449,5 +499,19 @@ int register_nf_conntrack_bpf(void) int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set); - return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set); + if (!ret) { + mutex_lock(&nf_conn_btf_access_lock); + nfct_bsa = _nf_conntrack_btf_struct_access; + mutex_unlock(&nf_conn_btf_access_lock); + } + + return ret; +} + +void cleanup_nf_conntrack_bpf(void) +{ + mutex_lock(&nf_conn_btf_access_lock); + nfct_bsa = NULL; + mutex_unlock(&nf_conn_btf_access_lock); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index da65c6e8eeeb..0195f60fc43b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2512,6 +2512,7 @@ static int kill_all(struct nf_conn *i, void *data) void nf_conntrack_cleanup_start(void) { + cleanup_nf_conntrack_bpf(); conntrack_gc_work.exiting = true; } From e2d75e954c0a277b8fa0ddf666ddd4f9b73195f7 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 7 Sep 2022 10:40:41 -0600 Subject: [PATCH 033/143] selftests/bpf: Add tests for writing to nf_conn:mark Add a simple extension to the existing selftest to write to nf_conn:mark. Also add a failure test for writing to unsupported field. 
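The allowed write path being tested boils down to a direct store to ct->mark after a lookup; a minimal sketch modeled on the selftest code below (tuple/opts setup and the kfunc declarations from test_bpf_nf*.c are elided):

	SEC("tc")
	int set_ct_mark(struct __sk_buff *ctx)
	{
		struct bpf_ct_opts___local opts = {};
		struct bpf_sock_tuple tup = {};
		struct nf_conn *ct;

		ct = bpf_skb_ct_lookup(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
		if (!ct)
			return 0;
		ct->mark = 42;	/* direct write, allowed by the previous patch */
		bpf_ct_release(ct);
		return 0;
	}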
Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/f78966b81b9349d2b8ebb4cee2caf15cb6b38ee2.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 2 ++ tools/testing/selftests/bpf/progs/test_bpf_nf.c | 9 +++++++-- .../testing/selftests/bpf/progs/test_bpf_nf_fail.c | 14 ++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index 544bf90ac2a7..ab9117ae7545 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -17,6 +17,7 @@ struct { { "set_status_after_insert", "kernel function bpf_ct_set_status args#0 expected pointer to STRUCT nf_conn___init but" }, { "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" }, { "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" }, + { "write_not_allowlisted_field", "no write support to nf_conn at off" }, }; enum { @@ -113,6 +114,7 @@ static void test_bpf_nf_ct(int mode) ASSERT_LE(skel->bss->test_delta_timeout, 10, "Test for max ct timeout update"); /* expected status is IPS_SEEN_REPLY */ ASSERT_EQ(skel->bss->test_status, 2, "Test for ct status update "); + ASSERT_EQ(skel->bss->test_insert_lookup_mark, 77, "Test for insert and lookup mark value"); ASSERT_EQ(skel->data->test_exist_lookup, 0, "Test existing connection lookup"); ASSERT_EQ(skel->bss->test_exist_lookup_mark, 43, "Test existing connection lookup ctmark"); end: diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c index 2722441850cc..b5e7079701e8 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c @@ -23,6 +23,7 @@ int test_insert_entry = -EAFNOSUPPORT; int test_succ_lookup = -ENOENT; u32 test_delta_timeout = 0; u32 test_status = 0; +u32 test_insert_lookup_mark = 0; __be32 saddr = 0; __be16 sport = 0; __be32 daddr = 0; @@ -144,6 +145,7 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, bpf_ct_set_timeout(ct, 10000); bpf_ct_set_status(ct, IPS_CONFIRMED); + ct->mark = 77; ct_ins = bpf_ct_insert_entry(ct); if (ct_ins) { @@ -157,6 +159,7 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, test_delta_timeout = ct_lk->timeout - bpf_jiffies64(); test_delta_timeout /= CONFIG_HZ; test_status = IPS_SEEN_REPLY; + test_insert_lookup_mark = ct_lk->mark; bpf_ct_change_status(ct_lk, IPS_SEEN_REPLY); bpf_ct_release(ct_lk); test_succ_lookup = 0; @@ -175,8 +178,10 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, sizeof(opts_def)); if (ct) { test_exist_lookup = 0; - if (ct->mark == 42) - test_exist_lookup_mark = 43; + if (ct->mark == 42) { + ct->mark++; + test_exist_lookup_mark = ct->mark; + } bpf_ct_release(ct); } else { test_exist_lookup = opts_def.error; diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c index bf79af15c808..0e4759ab38ff 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c @@ -69,6 +69,20 @@ int lookup_insert(struct __sk_buff *ctx) return 0; } +SEC("?tc") +int write_not_allowlisted_field(struct __sk_buff *ctx) +{ + struct bpf_ct_opts___local opts = {}; + struct bpf_sock_tuple tup = 
{};
+	struct nf_conn *ct;
+
+	ct = bpf_skb_ct_lookup(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
+	if (!ct)
+		return 0;
+	ct->status = 0xF00;
+	return 0;
+}
+
 SEC("?tc")
 int set_timeout_after_insert(struct __sk_buff *ctx)
 {
From f7c946f288e32fd8b5fd69825683420d473672bd Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi
Date: Thu, 8 Sep 2022 10:06:12 +0200
Subject: [PATCH 034/143] selftests/bpf: fix ct status check in bpf_nf selftests

Properly check the connection tracking entry status configured by running the bpf_ct_change_status kfunc. Remove the unnecessary IPS_CONFIRMED status configuration since it is already done during entry allocation.

Fixes: 6eb7fba007a7 ("selftests/bpf: Add tests for new nf_conntrack kfuncs")
Signed-off-by: Lorenzo Bianconi
Acked-by: Song Liu
Link: https://lore.kernel.org/r/813a5161a71911378dfac8770ec890428e4998aa.1662623574.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov
---
 tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 5 +++--
 tools/testing/selftests/bpf/progs/test_bpf_nf.c | 7 ++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
index ab9117ae7545..0677a51694c9 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include
 #include
+#include
 #include "test_bpf_nf.skel.h"
 #include "test_bpf_nf_fail.skel.h"
@@ -112,9 +113,9 @@ static void test_bpf_nf_ct(int mode)
 	/* allow some tolerance for test_delta_timeout value to avoid races. */
 	ASSERT_GT(skel->bss->test_delta_timeout, 8, "Test for min ct timeout update");
 	ASSERT_LE(skel->bss->test_delta_timeout, 10, "Test for max ct timeout update");
-	/* expected status is IPS_SEEN_REPLY */
-	ASSERT_EQ(skel->bss->test_status, 2, "Test for ct status update ");
 	ASSERT_EQ(skel->bss->test_insert_lookup_mark, 77, "Test for insert and lookup mark value");
+	ASSERT_EQ(skel->bss->test_status, IPS_CONFIRMED | IPS_SEEN_REPLY,
+		  "Test for ct status update ");
 	ASSERT_EQ(skel->data->test_exist_lookup, 0, "Test existing connection lookup");
 	ASSERT_EQ(skel->bss->test_exist_lookup_mark, 43, "Test existing connection lookup ctmark");
 end:
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
index b5e7079701e8..88842da86ddc 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
@@ -144,7 +144,6 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 		struct nf_conn *ct_ins;

 		bpf_ct_set_timeout(ct, 10000);
-		bpf_ct_set_status(ct, IPS_CONFIRMED);
 		ct->mark = 77;

 		ct_ins = bpf_ct_insert_entry(ct);
@@ -158,9 +157,11 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 			bpf_ct_change_timeout(ct_lk, 10000);
 			test_delta_timeout = ct_lk->timeout - bpf_jiffies64();
 			test_delta_timeout /= CONFIG_HZ;
-			test_status = IPS_SEEN_REPLY;
 			test_insert_lookup_mark = ct_lk->mark;
-			bpf_ct_change_status(ct_lk, IPS_SEEN_REPLY);
+			bpf_ct_change_status(ct_lk,
+					     IPS_CONFIRMED | IPS_SEEN_REPLY);
+			test_status = ct_lk->status;
+
 			bpf_ct_release(ct_lk);
 			test_succ_lookup = 0;
 		}
From 1bfe26fb082724be453e4d7fd9bb358e3ba669b2 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky
Date: Thu, 8 Sep 2022 16:07:16 -0700
Subject: [PATCH 035/143] bpf: Add verifier support for custom callback return range

Verifier logic to confirm that a callback function returns 0 or 1 was added in commit
69c087ba6225b ("bpf: Add bpf_for_each_map_elem() helper"). At the time, callback return value was only used to continue or stop iteration. In order to support callbacks with a broader return value range, such as those added in rbtree series[0] and others, add a callback_ret_range to bpf_func_state. Verifier's helpers which set in_callback_fn will also set the new field, which the verifier will later use to check return value bounds. Default to tnum_range(0, 0) instead of using tnum_unknown as a sentinel value as the latter would prevent the valid range (0, U64_MAX) being used. Previous global default tnum_range(0, 1) is explicitly set for extant callback helpers. The change to global default was made after discussion around this patch in rbtree series [1], goal here is to make it more obvious that callback_ret_range should be explicitly set. [0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com/ [1]: lore.kernel.org/bpf/20220830172759.4069786-2-davemarchevsky@fb.com/ Signed-off-by: Dave Marchevsky Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220908230716.2751723-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b49a349cc6ae..e197f8fb27e2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -248,6 +248,7 @@ struct bpf_func_state { */ u32 async_entry_cnt; bool in_callback_fn; + struct tnum callback_ret_range; bool in_async_callback_fn; /* The following fields should be last. See copy_func_state() */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9109e07b759a..c259d734f863 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1750,6 +1750,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->callsite = callsite; state->frameno = frameno; state->subprogno = subprogno; + state->callback_ret_range = tnum_range(0, 0); init_reg_state(env, state); mark_verifier_state_scratched(env); } @@ -6790,6 +6791,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return err; callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6811,6 +6813,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6840,6 +6843,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6867,6 +6871,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6894,7 +6899,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller = state->frame[state->curframe]; if (callee->in_callback_fn) { /* enforce R0 return value range [0, 1]. 
 */
-		struct tnum range = tnum_range(0, 1);
+		struct tnum range = callee->callback_ret_range;

 		if (r0->type != SCALAR_VALUE) {
 			verbose(env, "R0 not a scalar value\n");
From 47e34cb74d376ddfeaef94abb1d6dfb3c905ee51 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky
Date: Mon, 12 Sep 2022 08:45:44 -0700
Subject: [PATCH 036/143] bpf: Add verifier check for BPF_PTR_POISON retval and arg

BPF_PTR_POISON was added in commit c0a5a21c25f37 ("bpf: Allow storing referenced kptr in map") to denote a bpf_func_proto btf_id which the verifier will replace with a dynamically-determined btf_id at verification time.

This patch adds verifier 'poison' functionality to BPF_PTR_POISON in order to prepare for expanded use of the value to poison ret- and arg-btf_id in ongoing work, namely the rbtree and linked list patchsets [0, 1]. Specifically, when the verifier checks helper calls, it assumes that a BPF_PTR_POISON'ed ret type will be replaced with a valid type before - or in lieu of - the default ret_btf_id logic. Similarly for arg btf_id. If a poisoned btf_id reaches the default handling block for either, consider this a verifier internal error and fail verification. Otherwise a helper w/ a poisoned btf_id but no verifier logic replacing the type will cause a crash as the invalid pointer is dereferenced.

Also move BPF_PTR_POISON to the existing include/linux/poison.h header and remove the unnecessary shift.

[0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com
[1]: lore.kernel.org/bpf/20220904204145.3089-1-memxor@gmail.com

Signed-off-by: Dave Marchevsky
Acked-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20220912154544.1398199-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/poison.h |  3 +++
 kernel/bpf/helpers.c   |  6 +++---
 kernel/bpf/verifier.c  | 30 +++++++++++++++++++++++-------
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/include/linux/poison.h b/include/linux/poison.h
index d62ef5a6b4e9..2d3249eb0e62 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -81,4 +81,7 @@
 /********** net/core/page_pool.c **********/
 #define PP_SIGNATURE	(0x40 + POISON_POINTER_DELTA)

+/********** kernel/bpf/ **********/
+#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA))
+
 #endif
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index fc08035f14ed..41aeaf3862ec 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1376,10 +1377,9 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
 }

 /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
- * helper is determined dynamically by the verifier.
+ * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
+ * denote type that verifier will determine.
  */
-#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
-
 static const struct bpf_func_proto bpf_kptr_xchg_proto = {
 	.func         = bpf_kptr_xchg,
 	.gpl_only     = false,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c259d734f863..8c6fbcd0afaf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include "disasm.h"
@@ -5782,13 +5783,22 @@ found:
 		if (meta->func_id == BPF_FUNC_kptr_xchg) {
 			if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno))
 				return -EACCES;
-		} else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
-						 btf_vmlinux, *arg_btf_id,
-						 strict_type_match)) {
-			verbose(env, "R%d is of type %s but %s is expected\n",
-				regno, kernel_type_name(reg->btf, reg->btf_id),
-				kernel_type_name(btf_vmlinux, *arg_btf_id));
-			return -EACCES;
+		} else {
+			if (arg_btf_id == BPF_PTR_POISON) {
+				verbose(env, "verifier internal error:");
+				verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
+					regno);
+				return -EACCES;
+			}
+
+			if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+						  btf_vmlinux, *arg_btf_id,
+						  strict_type_match)) {
+				verbose(env, "R%d is of type %s but %s is expected\n",
+					regno, kernel_type_name(reg->btf, reg->btf_id),
+					kernel_type_name(btf_vmlinux, *arg_btf_id));
+				return -EACCES;
+			}
 		}
 	}

@@ -7457,6 +7467,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			ret_btf = meta.kptr_off_desc->kptr.btf;
 			ret_btf_id = meta.kptr_off_desc->kptr.btf_id;
 		} else {
+			if (fn->ret_btf_id == BPF_PTR_POISON) {
+				verbose(env, "verifier internal error:");
+				verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n",
+					func_id_name(func_id));
+				return -EINVAL;
+			}
 			ret_btf = btf_vmlinux;
 			ret_btf_id = *fn->ret_btf_id;
 		}
From a02c118ee9e898612cbae42121b9e8663455b515 Mon Sep 17 00:00:00 2001
From: Wang Yufen
Date: Tue, 13 Sep 2022 16:40:33 +0800
Subject: [PATCH 037/143] bpf: use kvmemdup_bpfptr helper

Use the kvmemdup_bpfptr helper instead of open-coding it to simplify the code.

Signed-off-by: Wang Yufen
Acked-by: Stanislav Fomichev
Link: https://lore.kernel.org/r/1663058433-14089-1-git-send-email-wangyufen@huawei.com
Signed-off-by: Martin KaFai Lau
---
 kernel/bpf/syscall.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 69be1c612daa..dab156f09f8d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1417,19 +1417,14 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 	}

 	value_size = bpf_map_value_size(map);
-
-	err = -ENOMEM;
-	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
-	if (!value)
+	value = kvmemdup_bpfptr(uvalue, value_size);
+	if (IS_ERR(value)) {
+		err = PTR_ERR(value);
 		goto free_key;
-
-	err = -EFAULT;
-	if (copy_from_bpfptr(value, uvalue, value_size) != 0)
-		goto free_value;
+	}

 	err = bpf_map_update_value(map, f, key, value, attr->flags);

-free_value:
 	kvfree(value);
 free_key:
 	kvfree(key);
From bfeb7e399bacae4ee46ad978f5fce3e47f0978d6 Mon Sep 17 00:00:00 2001
From: Yauheni Kaliuta
Date: Mon, 5 Sep 2022 12:01:49 +0300
Subject: [PATCH 038/143] bpf: Use bpf_capable() instead of CAP_SYS_ADMIN for blinding decision

The full CAP_SYS_ADMIN requirement for blinding looks too strict nowadays. These days, given that unprivileged BPF is disabled by default, the main users of constant blinding come from the unprivileged side, in particular via cBPF -> eBPF migration (e.g. old-style socket filters).
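For reference, the affected knob is the bpf_jit_harden sysctl whose documentation is updated below; an illustrative setting:

	# blind constants for unprivileged users only; with this change a
	# process with CAP_BPF or CAP_SYS_ADMIN (i.e. bpf_capable()) is exempt
	sysctl -w net.core.bpf_jit_harden=1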
Signed-off-by: Yauheni Kaliuta Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220831090655.156434-1-ykaliuta@redhat.com Link: https://lore.kernel.org/bpf/20220905090149.61221-1-ykaliuta@redhat.com --- Documentation/admin-guide/sysctl/net.rst | 3 +++ include/linux/filter.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 555681ef6195..6394f5dc2303 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -102,6 +102,9 @@ Values: - 1 - enable JIT hardening for unprivileged users only - 2 - enable JIT hardening for all users +where "privileged user" in this context means a process having +CAP_BPF or CAP_SYS_ADMIN in the root user name space. + bpf_jit_kallsyms ---------------- diff --git a/include/linux/filter.h b/include/linux/filter.h index 527ae1d64e27..75335432fcbc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1099,7 +1099,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN)) + if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; From 9440155ccb948f8e3ce5308907a2e7378799be60 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Sat, 3 Sep 2022 15:11:53 +0200 Subject: [PATCH 039/143] ftrace: Add HAVE_DYNAMIC_FTRACE_NO_PATCHABLE x86 will shortly start using -fpatchable-function-entry for purposes other than ftrace, make sure the __patchable_function_entry section isn't merged in the mcount_loc section. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220903131154.420467-2-jolsa@kernel.org --- include/asm-generic/vmlinux.lds.h | 11 ++++++++++- kernel/trace/Kconfig | 6 ++++++ tools/objtool/check.c | 3 ++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7515a465ec03..13b197ef0d63 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -154,6 +154,14 @@ #define MEM_DISCARD(sec) *(.mem##sec) #endif +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_NO_PATCHABLE +#define KEEP_PATCHABLE KEEP(*(__patchable_function_entries)) +#define PATCHABLE_DISCARDS +#else +#define KEEP_PATCHABLE +#define PATCHABLE_DISCARDS *(__patchable_function_entries) +#endif + #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* * The ftrace call sites are logged to a section whose name depends on the @@ -172,7 +180,7 @@ #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ - KEEP(*(__patchable_function_entries)) \ + KEEP_PATCHABLE \ __stop_mcount_loc = .; \ ftrace_stub_graph = ftrace_stub; \ ftrace_ops_list_func = arch_ftrace_ops_list_func; @@ -1024,6 +1032,7 @@ #define COMMON_DISCARDS \ SANITIZER_DISCARDS \ + PATCHABLE_DISCARDS \ *(.discard) \ *(.discard.*) \ *(.modinfo) \ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1052126bdca2..e9e95c790b8e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -51,6 +51,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS This allows for use of regs_get_kernel_argument() and kernel_stack_pointer(). +config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + bool + help + If the architecture generates __patchable_function_entries sections + but does not want them included in the ftrace locations. 
+ config HAVE_FTRACE_MCOUNT_RECORD bool help diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e55fdf952a3a..9216060c3408 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4113,7 +4113,8 @@ static int validate_ibt(struct objtool_file *file) !strcmp(sec->name, "__bug_table") || !strcmp(sec->name, "__ex_table") || !strcmp(sec->name, "__jump_table") || - !strcmp(sec->name, "__mcount_loc")) + !strcmp(sec->name, "__mcount_loc") || + strstr(sec->name, "__patchable_function_entries")) continue; list_for_each_entry(reloc, &sec->reloc->reloc_list, list) From ceea991a019c57a1fb0edd12a5f836a0fa431aee Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 3 Sep 2022 15:11:54 +0200 Subject: [PATCH 040/143] bpf: Move bpf_dispatcher function out of ftrace locations The dispatcher function is attached/detached to trampoline by dispatcher update function. At the same time it's available as ftrace attachable function. After discussion [1] the proposed solution is to use compiler attributes to alter bpf_dispatcher_##name##_func function: - remove it from being instrumented with __no_instrument_function__ attribute, so ftrace has no track of it - but still generate 5 nop instructions with patchable_function_entry(5) attribute, which are expected by bpf_arch_text_poke used by dispatcher update function Enabling HAVE_DYNAMIC_FTRACE_NO_PATCHABLE option for x86, so __patchable_function_entries functions are not part of ftrace/mcount locations. Adding attributes to bpf_dispatcher_XXX function on x86_64 so it's kept out of ftrace locations and has 5 byte nop generated at entry. These attributes need to be arch specific as pointed out by Ilya Leoshkevic in here [2]. The dispatcher image is generated only for x86_64 arch, so the code can stay as is for other archs. [1] https://lore.kernel.org/bpf/20220722110811.124515-1-jolsa@kernel.org/ [2] https://lore.kernel.org/bpf/969a14281a7791c334d476825863ee449964dd0c.camel@linux.ibm.com/ Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20220903131154.420467-3-jolsa@kernel.org --- arch/x86/Kconfig | 1 + include/linux/bpf.h | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f9920f1341c8..089c20cefd2b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -284,6 +284,7 @@ config X86 select PROC_PID_ARCH_STATUS if PROC_FS select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI + select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE config INSTRUCTION_DECODER def_bool y diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54178b9e9c3a..e0dbe0c0a17e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -977,7 +977,14 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); }, \ } +#ifdef CONFIG_X86_64 +#define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) +#else +#define BPF_DISPATCHER_ATTRIBUTES +#endif + #define DEFINE_BPF_DISPATCHER(name) \ + notrace BPF_DISPATCHER_ATTRIBUTES \ noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ From cf060c2c399fa457569123bb9806b455ff53e64c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 9 Sep 2022 12:30:51 -0700 Subject: [PATCH 041/143] selftests/bpf: Fix test_verif_scale{1,3} SEC() annotations Use proper SEC("tc") for test_verif_scale{1,3} programs. 
It's not a problem for selftests right now because we set the program type programmatically, but not having correct SEC() definitions makes it harder to generically load BPF object files.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20220909193053.577111-2-andrii@kernel.org
---
 tools/testing/selftests/bpf/progs/test_verif_scale1.c | 2 +-
 tools/testing/selftests/bpf/progs/test_verif_scale3.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale1.c b/tools/testing/selftests/bpf/progs/test_verif_scale1.c
index d38153dab3dd..ac6135d9374c 100644
--- a/tools/testing/selftests/bpf/progs/test_verif_scale1.c
+++ b/tools/testing/selftests/bpf/progs/test_verif_scale1.c
@@ -5,7 +5,7 @@
 #define ATTR __attribute__((noinline))
 #include "test_jhash.h"

-SEC("scale90_noinline")
+SEC("tc")
 int balancer_ingress(struct __sk_buff *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale3.c b/tools/testing/selftests/bpf/progs/test_verif_scale3.c
index 9beb5bf80373..ca33a9b711c4 100644
--- a/tools/testing/selftests/bpf/progs/test_verif_scale3.c
+++ b/tools/testing/selftests/bpf/progs/test_verif_scale3.c
@@ -5,7 +5,7 @@
 #define ATTR __attribute__((noinline))
 #include "test_jhash.h"

-SEC("scale90_noinline32")
+SEC("tc")
 int balancer_ingress(struct __sk_buff *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
From 749c202cb6ea40f4d7ac95c4a1217a7b506f43a8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Fri, 9 Sep 2022 12:30:52 -0700
Subject: [PATCH 042/143] libbpf: Fix crash if SEC("freplace") programs don't have attach_prog_fd set

Fix a SIGSEGV caused by libbpf trying to find attach type in vmlinux BTF for freplace programs. It's wrong to search in vmlinux BTF and libbpf doesn't even mark vmlinux BTF as required for freplace programs. So trying to search anything in obj->vmlinux_btf might cause a NULL dereference if nothing else in the BPF object requires vmlinux BTF.

Instead, error out if an freplace (EXT) program doesn't specify attach_prog_fd at load time.
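From the caller's side this means an freplace program needs its target set before load; a hedged libbpf usage sketch (program name, target FD, and target function name are illustrative):

	prog = bpf_object__find_program_by_name(obj, "frepl_prog");
	/* associate the SEC("freplace") prog with its target prog/function */
	err = bpf_program__set_attach_target(prog, target_prog_fd, "xdp_pass_func");
	if (!err)
		err = bpf_object__load(obj);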
Fixes: 91abb4a6d79d ("libbpf: Support attachment of BPF tracing programs to kernel modules") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220909193053.577111-3-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3ad139285fad..2ca30ccc774c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9084,11 +9084,15 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac int err = 0; /* BPF program's BTF ID */ - if (attach_prog_fd) { + if (prog->type == BPF_PROG_TYPE_EXT || attach_prog_fd) { + if (!attach_prog_fd) { + pr_warn("prog '%s': attach program FD is not set\n", prog->name); + return -EINVAL; + } err = libbpf_find_prog_btf_id(attach_name, attach_prog_fd); if (err < 0) { - pr_warn("failed to find BPF program (FD %d) BTF ID for '%s': %d\n", - attach_prog_fd, attach_name, err); + pr_warn("prog '%s': failed to find BPF program (FD %d) BTF ID for '%s': %d\n", + prog->name, attach_prog_fd, attach_name, err); return err; } *btf_obj_fd = 0; @@ -9105,7 +9109,8 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id); } if (err) { - pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err); + pr_warn("prog '%s': failed to find kernel BTF type ID of '%s': %d\n", + prog->name, attach_name, err); return err; } return 0; From c8bc5e0509767e51b35ae2f4af6ff90fa6a5f27f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 9 Sep 2022 12:30:53 -0700 Subject: [PATCH 043/143] selftests/bpf: Add veristat tool for mass-verifying BPF object files Add a small tool, veristat, that allows mass-verification of a set of *libbpf-compatible* BPF ELF object files. For each such object file, veristat will attempt to verify each BPF program *individually*. Regardless of success or failure, it parses BPF verifier stats and outputs them in human-readable table format. In the future we can also add CSV and JSON output for more scriptable post-processing, if necessary. veristat allows to specify a set of stats that should be output and ordering between multiple objects and files (e.g., so that one can easily order by total instructions processed, instead of default file name, prog name, verdict, total instructions order). This tool should be useful for validating various BPF verifier changes or even validating different kernel versions for regressions. 
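Output columns and sort order are picked with -o/-s using comma-separated stat aliases defined in the tool (file, prog, verdict, insns, states, ...); an illustrative invocation (object file name made up):

$ ./veristat -o file,prog,verdict,insns -s insns some_progs.bpf.o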
Here's an example for some of the heaviest selftests/bpf BPF object files: $ sudo ./veristat -s insns,file,prog {pyperf,loop,test_verif_scale,strobemeta,test_cls_redirect,profiler}*.linked3.o File Program Verdict Duration, us Total insns Total states Peak states ------------------------------------ ------------------------------------ ------- ------------ ----------- ------------ ----------- loop3.linked3.o while_true failure 350990 1000001 9663 9663 test_verif_scale3.linked3.o balancer_ingress success 115244 845499 8636 2141 test_verif_scale2.linked3.o balancer_ingress success 77688 773445 3048 788 pyperf600.linked3.o on_event success 2079872 624585 30335 30241 pyperf600_nounroll.linked3.o on_event success 353972 568128 37101 2115 strobemeta.linked3.o on_event success 455230 557149 15915 13537 test_verif_scale1.linked3.o balancer_ingress success 89880 554754 8636 2141 strobemeta_nounroll2.linked3.o on_event success 433906 501725 17087 1912 loop6.linked3.o trace_virtqueue_add_sgs success 282205 398057 8717 919 loop1.linked3.o nested_loops success 125630 361349 5504 5504 pyperf180.linked3.o on_event success 2511740 160398 11470 11446 pyperf100.linked3.o on_event success 744329 87681 6213 6191 test_cls_redirect.linked3.o cls_redirect success 54087 78925 4782 903 strobemeta_subprogs.linked3.o on_event success 57898 65420 1954 403 test_cls_redirect_subprogs.linked3.o cls_redirect success 54522 64965 4619 958 strobemeta_nounroll1.linked3.o on_event success 43313 57240 1757 382 pyperf50.linked3.o on_event success 194355 46378 3263 3241 profiler2.linked3.o tracepoint__syscalls__sys_enter_kill success 23869 43372 1423 542 pyperf_subprogs.linked3.o on_event success 29179 36358 2499 2499 profiler1.linked3.o tracepoint__syscalls__sys_enter_kill success 13052 27036 1946 936 profiler3.linked3.o tracepoint__syscalls__sys_enter_kill success 21023 26016 2186 915 profiler2.linked3.o kprobe__vfs_link success 5255 13896 303 271 profiler1.linked3.o kprobe__vfs_link success 7792 12687 1042 1041 profiler3.linked3.o kprobe__vfs_link success 7332 10601 865 865 profiler2.linked3.o kprobe_ret__do_filp_open success 3417 8900 216 199 profiler2.linked3.o kprobe__vfs_symlink success 3548 8775 203 186 pyperf_global.linked3.o on_event success 10007 7563 520 520 profiler3.linked3.o kprobe_ret__do_filp_open success 4708 6464 532 532 profiler1.linked3.o kprobe_ret__do_filp_open success 3090 6445 508 508 profiler3.linked3.o kprobe__vfs_symlink success 4477 6358 521 521 profiler1.linked3.o kprobe__vfs_symlink success 3381 6347 507 507 profiler2.linked3.o raw_tracepoint__sched_process_exec success 2464 5874 292 189 profiler3.linked3.o raw_tracepoint__sched_process_exec success 2677 4363 397 283 profiler2.linked3.o kprobe__proc_sys_write success 1800 4355 143 138 profiler1.linked3.o raw_tracepoint__sched_process_exec success 1649 4019 333 240 pyperf600_bpf_loop.linked3.o on_event success 2711 3966 306 306 profiler2.linked3.o raw_tracepoint__sched_process_exit success 1234 3138 83 66 profiler3.linked3.o kprobe__proc_sys_write success 1755 2623 223 223 profiler1.linked3.o kprobe__proc_sys_write success 1222 2456 193 193 loop2.linked3.o while_true success 608 1783 57 30 profiler3.linked3.o raw_tracepoint__sched_process_exit success 789 1680 146 146 profiler1.linked3.o raw_tracepoint__sched_process_exit success 592 1526 133 133 strobemeta_bpf_loop.linked3.o on_event success 1015 1512 106 106 loop4.linked3.o combinations success 165 524 18 17 profiler3.linked3.o raw_tracepoint__sched_process_fork success 196 299 25 25 
profiler1.linked3.o raw_tracepoint__sched_process_fork success 109 265 19 19 profiler2.linked3.o raw_tracepoint__sched_process_fork success 111 265 19 19 loop5.linked3.o while_true success 47 84 9 9 ------------------------------------ ------------------------------------ ------- ------------ ----------- ------------ ----------- Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220909193053.577111-4-andrii@kernel.org --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 7 +- tools/testing/selftests/bpf/veristat.c | 537 +++++++++++++++++++++++++ 3 files changed, 544 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/veristat.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3a8cb2404ea6..3b288562963e 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -39,6 +39,7 @@ test_cpp /tools /runqslower /bench +/veristat *.ko *.tmp xskxceiver diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6cd327f1f216..1a0296bd744a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -82,7 +82,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ - xskxceiver xdp_redirect_multi xdp_synproxy + xskxceiver xdp_redirect_multi xdp_synproxy veristat TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read @@ -595,6 +595,11 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ +$(OUTPUT)/veristat.o: $(BPFOBJ) +$(OUTPUT)/veristat: $(OUTPUT)/veristat.o + $(call msg,BINARY,,$@) + $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ + EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature bpftool \ diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c new file mode 100644 index 000000000000..39e6dc41e504 --- /dev/null +++ b/tools/testing/selftests/bpf/veristat.c @@ -0,0 +1,537 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum stat_id { + VERDICT, + DURATION, + TOTAL_INSNS, + TOTAL_STATES, + PEAK_STATES, + MAX_STATES_PER_INSN, + MARK_READ_MAX_LEN, + + FILE_NAME, + PROG_NAME, + + ALL_STATS_CNT, + NUM_STATS_CNT = FILE_NAME - VERDICT, +}; + +struct verif_stats { + char *file_name; + char *prog_name; + + long stats[NUM_STATS_CNT]; +}; + +struct stat_specs { + int spec_cnt; + enum stat_id ids[ALL_STATS_CNT]; + bool asc[ALL_STATS_CNT]; + int lens[ALL_STATS_CNT]; +}; + +static struct env { + char **filenames; + int filename_cnt; + bool verbose; + + struct verif_stats *prog_stats; + int prog_stat_cnt; + + struct stat_specs output_spec; + struct stat_specs sort_spec; +} env; + +static int libbpf_print_fn(enum libbpf_print_level level, + const char *format, va_list args) +{ + if (!env.verbose) + return 0; + if (level == LIBBPF_DEBUG /* && !env.verbose */) + return 0; + return vfprintf(stderr, format, args); +} + +const char *argp_program_version = "veristat"; +const char *argp_program_bug_address = ""; +const char argp_program_doc[] = +"veristat BPF verifier stats collection tool.\n" +"\n" +"USAGE: veristat [...]\n"; + +static const struct argp_option opts[] = { + { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, + { "verbose", 'v', NULL, 0, "Verbose mode" }, + { "output", 'o', "SPEC", 0, "Specify output stats" }, + { "sort", 's', "SPEC", 0, "Specify sort order" }, + {}, +}; + +static int parse_stats(const char *stats_str, struct stat_specs *specs); + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + void *tmp; + int err; + + switch (key) { + case 'h': + argp_state_help(state, stderr, ARGP_HELP_STD_HELP); + break; + case 'v': + env.verbose = true; + break; + case 'o': + err = parse_stats(arg, &env.output_spec); + if (err) + return err; + break; + case 's': + err = parse_stats(arg, &env.sort_spec); + if (err) + return err; + break; + case ARGP_KEY_ARG: + tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames)); + if (!tmp) + return -ENOMEM; + env.filenames = tmp; + env.filenames[env.filename_cnt] = strdup(arg); + if (!env.filenames[env.filename_cnt]) + return -ENOMEM; + env.filename_cnt++; + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static const struct stat_specs default_output_spec = { + .spec_cnt = 7, + .ids = { + FILE_NAME, PROG_NAME, VERDICT, DURATION, + TOTAL_INSNS, TOTAL_STATES, PEAK_STATES, + }, +}; + +static const struct stat_specs default_sort_spec = { + .spec_cnt = 2, + .ids = { + FILE_NAME, PROG_NAME, + }, + .asc = { true, true, }, +}; + +static struct stat_def { + const char *header; + const char *names[4]; + bool asc_by_default; +} stat_defs[] = { + [FILE_NAME] = { "File", {"file_name", "filename", "file"}, true /* asc */ }, + [PROG_NAME] = { "Program", {"prog_name", "progname", "prog"}, true /* asc */ }, + [VERDICT] = { "Verdict", {"verdict"}, true /* asc: failure, success */ }, + [DURATION] = { "Duration, us", {"duration", "dur"}, }, + [TOTAL_INSNS] = { "Total insns", {"total_insns", "insns"}, }, + [TOTAL_STATES] = { "Total states", {"total_states", "states"}, }, + [PEAK_STATES] = { "Peak states", {"peak_states"}, }, + [MAX_STATES_PER_INSN] = { "Max states per insn", {"max_states_per_insn"}, }, + [MARK_READ_MAX_LEN] = { "Max mark read length", {"max_mark_read_len", "mark_read"}, }, +}; + +static int parse_stat(const char *stat_name, struct stat_specs *specs) 
+{ + int id, i; + + if (specs->spec_cnt >= ARRAY_SIZE(specs->ids)) { + fprintf(stderr, "Can't specify more than %zd stats\n", ARRAY_SIZE(specs->ids)); + return -E2BIG; + } + + for (id = 0; id < ARRAY_SIZE(stat_defs); id++) { + struct stat_def *def = &stat_defs[id]; + + for (i = 0; i < ARRAY_SIZE(stat_defs[id].names); i++) { + if (!def->names[i] || strcmp(def->names[i], stat_name) != 0) + continue; + + specs->ids[specs->spec_cnt] = id; + specs->asc[specs->spec_cnt] = def->asc_by_default; + specs->spec_cnt++; + + return 0; + } + } + + fprintf(stderr, "Unrecognized stat name '%s'\n", stat_name); + return -ESRCH; +} + +static int parse_stats(const char *stats_str, struct stat_specs *specs) +{ + char *input, *state = NULL, *next; + int err; + + input = strdup(stats_str); + if (!input) + return -ENOMEM; + + while ((next = strtok_r(state ? NULL : input, ",", &state))) { + err = parse_stat(next, specs); + if (err) + return err; + } + + return 0; +} + +static char verif_log_buf[64 * 1024]; + +static int parse_verif_log(const char *buf, size_t buf_sz, struct verif_stats *s) +{ + const char *next; + int pos; + + for (pos = 0; buf[0]; buf = next) { + if (buf[0] == '\n') + buf++; + next = strchrnul(&buf[pos], '\n'); + + if (1 == sscanf(buf, "verification time %ld usec\n", &s->stats[DURATION])) + continue; + if (6 == sscanf(buf, "processed %ld insns (limit %*d) max_states_per_insn %ld total_states %ld peak_states %ld mark_read %ld", + &s->stats[TOTAL_INSNS], + &s->stats[MAX_STATES_PER_INSN], + &s->stats[TOTAL_STATES], + &s->stats[PEAK_STATES], + &s->stats[MARK_READ_MAX_LEN])) + continue; + } + + return 0; +} + +static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog) +{ + const char *prog_name = bpf_program__name(prog); + size_t buf_sz = sizeof(verif_log_buf); + char *buf = verif_log_buf; + struct verif_stats *stats; + int err = 0; + void *tmp; + + tmp = realloc(env.prog_stats, (env.prog_stat_cnt + 1) * sizeof(*env.prog_stats)); + if (!tmp) + return -ENOMEM; + env.prog_stats = tmp; + stats = &env.prog_stats[env.prog_stat_cnt++]; + memset(stats, 0, sizeof(*stats)); + + if (env.verbose) { + buf_sz = 16 * 1024 * 1024; + buf = malloc(buf_sz); + if (!buf) + return -ENOMEM; + bpf_program__set_log_buf(prog, buf, buf_sz); + bpf_program__set_log_level(prog, 1 | 4); /* stats + log */ + } else { + bpf_program__set_log_buf(prog, buf, buf_sz); + bpf_program__set_log_level(prog, 4); /* only verifier stats */ + } + verif_log_buf[0] = '\0'; + + err = bpf_object__load(obj); + + stats->file_name = strdup(basename(filename)); + stats->prog_name = strdup(bpf_program__name(prog)); + stats->stats[VERDICT] = err == 0; /* 1 - success, 0 - failure */ + parse_verif_log(buf, buf_sz, stats); + + if (env.verbose) { + printf("PROCESSING %s/%s, DURATION US: %ld, VERDICT: %s, VERIFIER LOG:\n%s\n", + filename, prog_name, stats->stats[DURATION], + err ? 
"failure" : "success", buf); + } + + if (verif_log_buf != buf) + free(buf); + + return 0; +}; + +static int process_obj(const char *filename) +{ + struct bpf_object *obj = NULL, *tobj; + struct bpf_program *prog, *tprog, *lprog; + libbpf_print_fn_t old_libbpf_print_fn; + LIBBPF_OPTS(bpf_object_open_opts, opts); + int err = 0, prog_cnt = 0; + + old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn); + + obj = bpf_object__open_file(filename, &opts); + if (!obj) { + err = -errno; + fprintf(stderr, "Failed to open '%s': %d\n", filename, err); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + prog_cnt++; + } + + if (prog_cnt == 1) { + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_autoload(prog, true); + process_prog(filename, obj, prog); + bpf_object__close(obj); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + const char *prog_name = bpf_program__name(prog); + + tobj = bpf_object__open_file(filename, &opts); + if (!tobj) { + err = -errno; + fprintf(stderr, "Failed to open '%s': %d\n", filename, err); + goto cleanup; + } + + bpf_object__for_each_program(tprog, tobj) { + const char *tprog_name = bpf_program__name(tprog); + + if (strcmp(prog_name, tprog_name) == 0) { + bpf_program__set_autoload(tprog, true); + lprog = tprog; + } else { + bpf_program__set_autoload(tprog, false); + } + } + + process_prog(filename, tobj, lprog); + bpf_object__close(tobj); + } + +cleanup: + bpf_object__close(obj); + libbpf_set_print(old_libbpf_print_fn); + return err; +} + +static int cmp_stat(const struct verif_stats *s1, const struct verif_stats *s2, + enum stat_id id, bool asc) +{ + int cmp = 0; + + switch (id) { + case FILE_NAME: + cmp = strcmp(s1->file_name, s2->file_name); + break; + case PROG_NAME: + cmp = strcmp(s1->prog_name, s2->prog_name); + break; + case VERDICT: + case DURATION: + case TOTAL_INSNS: + case TOTAL_STATES: + case PEAK_STATES: + case MAX_STATES_PER_INSN: + case MARK_READ_MAX_LEN: { + long v1 = s1->stats[id]; + long v2 = s2->stats[id]; + + if (v1 != v2) + cmp = v1 < v2 ? -1 : 1; + break; + } + default: + fprintf(stderr, "Unrecognized stat #%d\n", id); + exit(1); + } + + return asc ? cmp : -cmp; +} + +static int cmp_prog_stats(const void *v1, const void *v2) +{ + const struct verif_stats *s1 = v1, *s2 = v2; + int i, cmp; + + for (i = 0; i < env.sort_spec.spec_cnt; i++) { + cmp = cmp_stat(s1, s2, env.sort_spec.ids[i], env.sort_spec.asc[i]); + if (cmp != 0) + return cmp; + } + + return 0; +} + +#define HEADER_CHAR '-' +#define COLUMN_SEP " " + +static void output_headers(bool calc_len) +{ + int i, len; + + for (i = 0; i < env.output_spec.spec_cnt; i++) { + int id = env.output_spec.ids[i]; + int *max_len = &env.output_spec.lens[i]; + + if (calc_len) { + len = snprintf(NULL, 0, "%s", stat_defs[id].header); + if (len > *max_len) + *max_len = len; + } else { + printf("%s%-*s", i == 0 ? "" : COLUMN_SEP, *max_len, stat_defs[id].header); + } + } + + if (!calc_len) + printf("\n"); +} + +static void output_header_underlines(void) +{ + int i, j, len; + + for (i = 0; i < env.output_spec.spec_cnt; i++) { + len = env.output_spec.lens[i]; + + printf("%s", i == 0 ? 
"" : COLUMN_SEP); + for (j = 0; j < len; j++) + printf("%c", HEADER_CHAR); + } + printf("\n"); +} + +static void output_stats(const struct verif_stats *s, bool calc_len) +{ + int i; + + for (i = 0; i < env.output_spec.spec_cnt; i++) { + int id = env.output_spec.ids[i]; + int *max_len = &env.output_spec.lens[i], len; + const char *str = NULL; + long val = 0; + + switch (id) { + case FILE_NAME: + str = s->file_name; + break; + case PROG_NAME: + str = s->prog_name; + break; + case VERDICT: + str = s->stats[VERDICT] ? "success" : "failure"; + break; + case DURATION: + case TOTAL_INSNS: + case TOTAL_STATES: + case PEAK_STATES: + case MAX_STATES_PER_INSN: + case MARK_READ_MAX_LEN: + val = s->stats[id]; + break; + default: + fprintf(stderr, "Unrecognized stat #%d\n", id); + exit(1); + } + + if (calc_len) { + if (str) + len = snprintf(NULL, 0, "%s", str); + else + len = snprintf(NULL, 0, "%ld", val); + if (len > *max_len) + *max_len = len; + } else { + if (str) + printf("%s%-*s", i == 0 ? "" : COLUMN_SEP, *max_len, str); + else + printf("%s%*ld", i == 0 ? "" : COLUMN_SEP, *max_len, val); + } + } + + if (!calc_len) + printf("\n"); +} + +int main(int argc, char **argv) +{ + static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + }; + int err = 0, i; + + if (argp_parse(&argp, argc, argv, 0, NULL, NULL)) + return 1; + + if (env.filename_cnt == 0) { + fprintf(stderr, "Please provide path to BPF object file!\n"); + argp_help(&argp, stderr, ARGP_HELP_USAGE, "veristat"); + return 1; + } + + if (env.output_spec.spec_cnt == 0) + env.output_spec = default_output_spec; + if (env.sort_spec.spec_cnt == 0) + env.sort_spec = default_sort_spec; + + for (i = 0; i < env.filename_cnt; i++) { + err = process_obj(env.filenames[i]); + if (err) { + fprintf(stderr, "Failed to process '%s': %d\n", env.filenames[i], err); + goto cleanup; + } + } + + qsort(env.prog_stats, env.prog_stat_cnt, sizeof(*env.prog_stats), cmp_prog_stats); + + /* calculate column widths */ + output_headers(true); + for (i = 0; i < env.prog_stat_cnt; i++) { + output_stats(&env.prog_stats[i], true); + } + + /* actually output the table */ + output_headers(false); + output_header_underlines(); + for (i = 0; i < env.prog_stat_cnt; i++) { + output_stats(&env.prog_stats[i], false); + } + output_header_underlines(); + printf("\n"); + + printf("Done. Processed %d object files, %d programs.\n", + env.filename_cnt, env.prog_stat_cnt); + +cleanup: + for (i = 0; i < env.prog_stat_cnt; i++) { + free(env.prog_stats[i].file_name); + free(env.prog_stats[i].prog_name); + } + free(env.prog_stats); + for (i = 0; i < env.filename_cnt; i++) + free(env.filenames[i]); + free(env.filenames); + return -err; +} From dc567045f1590f6460d3e9a6ea6ad5e600b58b84 Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Tue, 13 Sep 2022 15:36:43 +0800 Subject: [PATCH 044/143] libbpf: Clean up legacy bpf maps declaration in bpf_helpers Legacy BPF map declarations are no longer supported in libbpf v1.0 [0]. Only BTF-defined maps are supported starting from v1.0, so it is time to remove the definition of bpf_map_def in bpf_helpers.h. 
[0] https://github.com/libbpf/libbpf/wiki/Libbpf:-the-road-to-v1.0 Signed-off-by: Xin Liu Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220913073643.19960-1-liuxin350@huawei.com --- tools/lib/bpf/bpf_helpers.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 7349b16b8e2f..d37c4fe2849d 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -160,18 +160,6 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) } #endif -/* - * Helper structure used by eBPF C program - * to describe BPF map attributes to libbpf loader - */ -struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; -} __attribute__((deprecated("use BTF-defined maps in .maps section"))); - enum libbpf_pin_type { LIBBPF_PIN_NONE, /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ From 571f9738bfb3d4b42253c1d0ad26da9fede85f36 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Fri, 16 Sep 2022 13:28:00 -0700 Subject: [PATCH 045/143] bpf/btf: Use btf_type_str() whenever possible We have btf_type_str(). Use it whenever possible in btf.c, instead of "btf_kind_str[BTF_INFO_KIND(t->info)]". Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/20220916202800.31421-1-yepeilin.cs@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/btf.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 98be25d13325..b3940c605aac 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1397,7 +1397,6 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; - u8 kind = BTF_INFO_KIND(t->info); struct btf *btf = env->btf; va_list args; @@ -1413,7 +1412,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, - btf_kind_str[kind], + btf_type_str(t), __btf_name_by_offset(btf, t->name_off), log_details ? " " : ""); @@ -5427,7 +5426,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", - btf_kind_str[BTF_INFO_KIND(t->info)]); + btf_type_str(t)); return false; } break; @@ -5454,7 +5453,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, "func '%s' arg%d '%s' has type %s. 
Only pointer access is allowed\n",
		tname, arg, __btf_name_by_offset(btf, t->name_off),
-		btf_kind_str[BTF_INFO_KIND(t->info)]);
+		btf_type_str(t));
 	return false;
 }
@@ -5538,11 +5537,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	if (!btf_type_is_struct(t)) {
 		bpf_log(log,
 			"func '%s' arg%d type %s is not a struct\n",
-			tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
+			tname, arg, btf_type_str(t));
 		return false;
 	}
 	bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
-		tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
+		tname, arg, info->btf_id, btf_type_str(t),
 		__btf_name_by_offset(btf, t->name_off));
 	return true;
 }
@@ -5950,7 +5949,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 	if (ret < 0 || __btf_type_is_struct(t)) {
 		bpf_log(log,
 			"The function %s return type %s is unsupported.\n",
-			tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
+			tname, btf_type_str(t));
 		return -EINVAL;
 	}
 	m->ret_size = ret;
@@ -5968,7 +5967,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 		if (ret < 0 || ret > 16) {
 			bpf_log(log,
 				"The function %s arg%d type %s is unsupported.\n",
-				tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+				tname, i, btf_type_str(t));
 			return -EINVAL;
 		}
 		if (ret == 0) {
@@ -6727,7 +6726,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			continue;
 		}
 		bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
-			i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+			i, btf_type_str(t), tname);
 		return -EINVAL;
 	}
 	return 0;

From a7e85406bdbd0c376f3997e571f7073b9527272e Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Mon, 19 Sep 2022 11:57:14 +0800
Subject: [PATCH 046/143] selftests/bpf: Add test result messages for
 test_task_storage_map_stress_lookup

Add a test result message when test_task_storage_map_stress_lookup()
succeeds or is skipped. The test case can be skipped due to the choice
of preemption model in the kernel config, so export the skips counter
from test_maps.c and increment it when needed.
The following is the output of test_maps when the test case succeeds or
is skipped:

  test_task_storage_map_stress_lookup:PASS
  test_maps: OK, 0 SKIPPED

  test_task_storage_map_stress_lookup SKIP (no CONFIG_PREEMPT)
  test_maps: OK, 1 SKIPPED

Fixes: 73b97bc78b32 ("selftests/bpf: Test concurrent updates on bpf_task_storage_busy")
Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20220919035714.2195144-1-houtao@huaweicloud.com
Signed-off-by: Martin KaFai Lau
---
 tools/testing/selftests/bpf/map_tests/task_storage_map.c | 6 +++++-
 tools/testing/selftests/bpf/test_maps.c | 2 +-
 tools/testing/selftests/bpf/test_maps.h | 2 ++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
index 1adc9c292eb2..aac08c85240b 100644
--- a/tools/testing/selftests/bpf/map_tests/task_storage_map.c
+++ b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
@@ -77,8 +77,11 @@ void test_task_storage_map_stress_lookup(void)
 	CHECK(err, "open_and_load", "error %d\n", err);
 
 	/* Only for a fully preemptible kernel */
-	if (!skel->kconfig->CONFIG_PREEMPT)
+	if (!skel->kconfig->CONFIG_PREEMPT) {
+		printf("%s SKIP (no CONFIG_PREEMPT)\n", __func__);
+		skips++;
 		return;
+	}
 
 	/* Save the old affinity setting */
 	sched_getaffinity(getpid(), sizeof(old), &old);
@@ -119,4 +122,5 @@ out:
 	read_bpf_task_storage_busy__destroy(skel);
 	/* Restore affinity setting */
 	sched_setaffinity(getpid(), sizeof(old), &old);
+	printf("%s:PASS\n", __func__);
 }
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 00b9cc305e58..289ff310e283 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -30,7 +30,7 @@
 #define ENOTSUPP 524
 #endif
 
-static int skips;
+int skips;
 
 static struct bpf_map_create_opts map_opts = { .sz = sizeof(map_opts) };
 
diff --git a/tools/testing/selftests/bpf/test_maps.h b/tools/testing/selftests/bpf/test_maps.h
index 77d8587ac4ed..f6fbca761732 100644
--- a/tools/testing/selftests/bpf/test_maps.h
+++ b/tools/testing/selftests/bpf/test_maps.h
@@ -14,4 +14,6 @@
 	}							\
 })
 
+extern int skips;
+
 #endif

From c31b38cb948ee7d3317139f005fa1f90de4a06b7 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Mon, 19 Sep 2022 22:48:11 +0800
Subject: [PATCH 047/143] bpf: Check whether or not node is NULL before
 freeing it in free_bulk

llnode could be NULL if there are new allocations after the check of
c->free_cnt > c->high_watermark in bpf_mem_refill() and before the call
to __llist_del_first() in free_bulk() (e.g. on a PREEMPT_RT kernel, or
for an allocation in NMI context). This incurs an oops as shown below:

  BUG: kernel NULL pointer dereference, address: 0000000000000000
  #PF: supervisor write access in kernel mode
  #PF: error_code(0x0002) - not-present page
  PGD 0 P4D 0
  Oops: 0002 [#1] PREEMPT_RT SMP
  CPU: 39 PID: 373 Comm: irq_work/39 Tainted: G W 6.0.0-rc6-rt9+ #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
  RIP: 0010:bpf_mem_refill+0x66/0x130
  ......
  Call Trace:
   irq_work_single+0x24/0x60
   irq_work_run_list+0x24/0x30
   run_irq_workd+0x18/0x20
   smpboot_thread_fn+0x13f/0x2c0
   kthread+0x121/0x140
   ? kthread_complete_and_exit+0x20/0x20
   ret_from_fork+0x1f/0x30

Fix it by checking whether llnode is NULL in free_bulk() before
enqueueing it for freeing.
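The rule being applied is generic to lock-free list pops: a pop may
observe an empty list even though the caller checked a counter a moment
earlier, so its result must be NULL-checked before use. A small
user-space analogy (illustrative only, not the kernel code) built on
C11 atomics:

/* A lock-free pop can return NULL right after a non-empty check,
 * because another context may drain the list in between; the fix
 * below applies exactly this NULL check to the __llist_del_first()
 * result.
 */
#include <stdatomic.h>
#include <stdio.h>

struct node { struct node *next; };

static _Atomic(struct node *) head;

static struct node *pop(void)
{
	struct node *n = atomic_load(&head);

	while (n && !atomic_compare_exchange_weak(&head, &n, n->next))
		;
	return n; /* may be NULL: caller must check */
}

int main(void)
{
	static struct node a;

	atomic_store(&head, &a);
	printf("first pop:  %p\n", (void *)pop());
	printf("second pop: %p\n", (void *)pop()); /* NULL: already drained */
	return 0;
}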
Fixes: 8d5a8011b35d ("bpf: Batch call_rcu callbacks instead of SLAB_TYPESAFE_BY_RCU.") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220919144811.3570825-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 20621f5407d8..5f83be1d2018 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -277,7 +277,8 @@ static void free_bulk(struct bpf_mem_cache *c) local_dec(&c->active); if (IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(flags); - enque_to_free(c, llnode); + if (llnode) + enque_to_free(c, llnode); } while (cnt > (c->high_watermark + c->low_watermark) / 2); /* and drain free_llist_extra */ From 52bdae37c92ae10d47d54bd7cd39e0a17547ebfa Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 20 Sep 2022 08:15:22 -0600 Subject: [PATCH 048/143] bpf: Remove unused btf_struct_access stub This stub was not being used anywhere. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/590e7bd6172ffe0f3d7b51cd40e8ded941aaf7e8.1663683114.git.dxu@dxuuu.xyz Signed-off-by: Martin KaFai Lau --- include/net/netfilter/nf_conntrack_bpf.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index a61a93d1c6dc..9c07d2d59da5 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -3,8 +3,6 @@ #ifndef _NF_CONNTRACK_BPF_H #define _NF_CONNTRACK_BPF_H -#include -#include #include #include @@ -31,16 +29,6 @@ static inline void cleanup_nf_conntrack_bpf(void) { } -static inline int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) -{ - return -EACCES; -} - #endif #endif /* _NF_CONNTRACK_BPF_H */ From 5a090aa35038e3dad1ee334e3c509c39e7599bb4 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 20 Sep 2022 08:15:23 -0600 Subject: [PATCH 049/143] bpf: Rename nfct_bsa to nfct_btf_struct_access The former name was a little hard to guess. 
Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/73adc72385c8b162391fbfb404f0b6d4c5cc55d7.1663683114.git.dxu@dxuuu.xyz Signed-off-by: Martin KaFai Lau --- include/net/netfilter/nf_conntrack_bpf.h | 8 ++++---- net/core/filter.c | 18 +++++++++--------- net/netfilter/nf_conntrack_bpf.c | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index 9c07d2d59da5..1199d4f8e019 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -13,10 +13,10 @@ extern int register_nf_conntrack_bpf(void); extern void cleanup_nf_conntrack_bpf(void); extern struct mutex nf_conn_btf_access_lock; -extern int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); #else diff --git a/net/core/filter.c b/net/core/filter.c index 4b2be211bcbe..2fd9449026aa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8608,11 +8608,11 @@ static bool tc_cls_act_is_valid_access(int off, int size, DEFINE_MUTEX(nf_conn_btf_access_lock); EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); -int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); -EXPORT_SYMBOL_GPL(nfct_bsa); +int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); +EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, @@ -8628,8 +8628,8 @@ static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, flag); mutex_lock(&nf_conn_btf_access_lock); - if (nfct_bsa) - ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); mutex_unlock(&nf_conn_btf_access_lock); return ret; @@ -8708,8 +8708,8 @@ static int xdp_btf_struct_access(struct bpf_verifier_log *log, flag); mutex_lock(&nf_conn_btf_access_lock); - if (nfct_bsa) - ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); mutex_unlock(&nf_conn_btf_access_lock); return ret; diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 77eb8e959f61..29c4efb3da5e 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -502,7 +502,7 @@ int register_nf_conntrack_bpf(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set); if (!ret) { mutex_lock(&nf_conn_btf_access_lock); - nfct_bsa = _nf_conntrack_btf_struct_access; + nfct_btf_struct_access = _nf_conntrack_btf_struct_access; mutex_unlock(&nf_conn_btf_access_lock); } @@ -512,6 +512,6 @@ int register_nf_conntrack_bpf(void) void cleanup_nf_conntrack_bpf(void) { mutex_lock(&nf_conn_btf_access_lock); - nfct_bsa = NULL; + nfct_btf_struct_access = NULL; 
 	mutex_unlock(&nf_conn_btf_access_lock);
 }

From fdf214978a71b2749d26f6da2b1d51d9ac23831d Mon Sep 17 00:00:00 2001
From: Daniel Xu
Date: Tue, 20 Sep 2022 08:15:24 -0600
Subject: [PATCH 050/143] bpf: Move nf_conn extern declarations to filter.h

We're seeing the following new warnings on netdev/build_32bit and
netdev/build_allmodconfig_warn CI jobs:

  ../net/core/filter.c:8608:1: warning: symbol 'nf_conn_btf_access_lock' was not declared. Should it be static?
  ../net/core/filter.c:8611:5: warning: symbol 'nfct_bsa' was not declared. Should it be static?

Fix by ensuring the extern declarations are visible while compiling
filter.o.

Fixes: 864b656f82cc ("bpf: Add support for writing to nf_conn:mark")
Signed-off-by: Daniel Xu
Link: https://lore.kernel.org/r/2bd2e0283df36d8a4119605878edb1838d144174.1663683114.git.dxu@dxuuu.xyz
Signed-off-by: Martin KaFai Lau
---
 include/linux/filter.h | 6 ++++++
 include/net/netfilter/nf_conntrack_bpf.h | 7 -------
 net/netfilter/nf_conntrack_bpf.c | 1 +
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 75335432fcbc..98e28126c24b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -567,6 +567,12 @@ struct sk_filter {
 
 DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 
+extern struct mutex nf_conn_btf_access_lock;
+extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf,
+				     const struct btf_type *t, int off, int size,
+				     enum bpf_access_type atype, u32 *next_btf_id,
+				     enum bpf_type_flag *flag);
+
 typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
 					  const struct bpf_insn *insnsi,
 					  unsigned int (*bpf_func)(const void *,
diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
index 1199d4f8e019..c8b80add1142 100644
--- a/include/net/netfilter/nf_conntrack_bpf.h
+++ b/include/net/netfilter/nf_conntrack_bpf.h
@@ -4,7 +4,6 @@
 #define _NF_CONNTRACK_BPF_H
 
 #include
-#include
 
 #if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
     (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
@@ -12,12 +11,6 @@ extern int register_nf_conntrack_bpf(void);
 extern void cleanup_nf_conntrack_bpf(void);
 
-extern struct mutex nf_conn_btf_access_lock;
-extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf,
-				     const struct btf_type *t, int off, int size,
-				     enum bpf_access_type atype, u32 *next_btf_id,
-				     enum bpf_type_flag *flag);
-
 #else
 
 static inline int register_nf_conntrack_bpf(void)
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 29c4efb3da5e..67df64283aef 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include

From bc069da65eec7b5113b40432930152c9c1cd7f88 Mon Sep 17 00:00:00 2001
From: Rong Tao
Date: Sun, 11 Sep 2022 21:03:30 +0800
Subject: [PATCH 051/143] samples/bpf: Replace blk_account_io_done() with
 __blk_account_io_done()

Since commit be6bfe36db17 ("block: inline hot paths of
blk_account_io_*()"), the blk_account_io_*() helpers have become inline
functions, so the blk_account_io_done symbol these samples attach their
kprobes to no longer exists; attach to __blk_account_io_done() instead.
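Because kprobe attach points silently depend on which functions survive
inlining, it can be worth verifying a symbol before attaching to it. A
small sketch of such a check (an editorial aside, not part of the
patch) that scans /proc/kallsyms:

/* Return true if @name is present in /proc/kallsyms. On kernels where
 * blk_account_io_done() has been inlined it no longer appears here,
 * while __blk_account_io_done() still does.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool ksym_exists(const char *name)
{
	char line[512];
	bool found = false;
	FILE *f = fopen("/proc/kallsyms", "r");

	if (!f)
		return false;
	while (fgets(line, sizeof(line), f)) {
		char sym[256];

		/* each line is "<addr> <type> <symbol> [module]" */
		if (sscanf(line, "%*lx %*c %255s", sym) == 1 &&
		    strcmp(sym, name) == 0) {
			found = true;
			break;
		}
	}
	fclose(f);
	return found;
}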
Signed-off-by: Rong Tao
Signed-off-by: Andrii Nakryiko
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/tencent_1CC476835C219FACD84B6715F0D785517E07@qq.com
---
 samples/bpf/task_fd_query_kern.c | 2 +-
 samples/bpf/task_fd_query_user.c | 2 +-
 samples/bpf/tracex3_kern.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
index c821294e1774..186ac0a79c0a 100644
--- a/samples/bpf/task_fd_query_kern.c
+++ b/samples/bpf/task_fd_query_kern.c
@@ -10,7 +10,7 @@ int bpf_prog1(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kretprobe/blk_account_io_done")
+SEC("kretprobe/__blk_account_io_done")
 int bpf_prog2(struct pt_regs *ctx)
 {
 	return 0;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
index 424718c0872c..a33d74bd3a4b 100644
--- a/samples/bpf/task_fd_query_user.c
+++ b/samples/bpf/task_fd_query_user.c
@@ -348,7 +348,7 @@ int main(int argc, char **argv)
 	/* test two functions in the corresponding *_kern.c file */
 	CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request",
 					   BPF_FD_TYPE_KPROBE));
-	CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_done",
+	CHECK_AND_RET(test_debug_fs_kprobe(1, "__blk_account_io_done",
 					   BPF_FD_TYPE_KRETPROBE));
 
 	/* test nondebug fs kprobe */
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
index 710a4410b2fb..bde6591cb20c 100644
--- a/samples/bpf/tracex3_kern.c
+++ b/samples/bpf/tracex3_kern.c
@@ -49,7 +49,7 @@ struct {
 	__uint(max_entries, SLOTS);
 } lat_map SEC(".maps");
 
-SEC("kprobe/blk_account_io_done")
+SEC("kprobe/__blk_account_io_done")
 int bpf_prog2(struct pt_regs *ctx)
 {
 	long rq = PT_REGS_PARM1(ctx);

From 7620bffbf72cd66a5d18e444a143b5b5989efa87 Mon Sep 17 00:00:00 2001
From: Xin Liu
Date: Sat, 17 Sep 2022 16:48:09 +0800
Subject: [PATCH 052/143] libbpf: Fix NULL pointer exception in API
 btf_dump__dump_type_data

The function btf_dump__dump_type_data() can be called by the user as an
API, but the `opts` parameter it receives may be a null pointer.
Dereferencing `opts->indent_str` in that case triggers a NULL pointer
dereference.

Fixes: 2ce8450ef5a3 ("libbpf: add bpf_object__open_{file, mem} w/ extensible opts")
Signed-off-by: Xin Liu
Signed-off-by: Weibin Kong
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20220917084809.30770-1-liuxin350@huawei.com
---
 tools/lib/bpf/btf_dump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 627edb5bb6de..4221f73a74d0 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -2385,7 +2385,7 @@ int btf_dump__dump_type_data(struct btf_dump *d, __u32 id,
 	d->typed_dump->indent_lvl = OPTS_GET(opts, indent_level, 0);
 
 	/* default indent string is a tab */
-	if (!opts->indent_str)
+	if (!OPTS_GET(opts, indent_str, NULL))
 		d->typed_dump->indent_str[0] = '\t';
 	else
 		libbpf_strlcpy(d->typed_dump->indent_str, opts->indent_str,

From 3a74904ceff3ecdb9d6cc0844ed67df417968eb6 Mon Sep 17 00:00:00 2001
From: William Dean
Date: Sat, 17 Sep 2022 16:42:48 +0800
Subject: [PATCH 053/143] bpf: simplify code in btf_parse_hdr

btf_parse_hdr() can simply return the result of btf_check_sec_info()
directly, which simplifies the code.
Signed-off-by: William Dean
Acked-by: Yonghong Song
Link: https://lore.kernel.org/r/20220917084248.3649-1-williamsukatube@163.com
Signed-off-by: Martin KaFai Lau
---
 kernel/bpf/btf.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b3940c605aac..6ccd4f4d731e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4854,7 +4854,6 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
 	u32 hdr_len, hdr_copy, btf_data_size;
 	const struct btf_header *hdr;
 	struct btf *btf;
-	int err;
 
 	btf = env->btf;
 	btf_data_size = btf->data_size;
@@ -4911,11 +4910,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
 		return -EINVAL;
 	}
 
-	err = btf_check_sec_info(env, btf_data_size);
-	if (err)
-		return err;
-
-	return 0;
+	return btf_check_sec_info(env, btf_data_size);
 }
 
 static int btf_check_type_tags(struct btf_verifier_env *env,

From 583c1f420173f7d84413a1a1fbf5109d798b4faa Mon Sep 17 00:00:00 2001
From: David Vernet
Date: Mon, 19 Sep 2022 19:00:57 -0500
Subject: [PATCH 054/143] bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type

We want to support a ringbuf map type where samples are published from
user-space, to be consumed by BPF programs. BPF currently supports a
kernel -> user-space circular ring buffer via the BPF_MAP_TYPE_RINGBUF
map type. We'll need to define a new map type for user-space -> kernel,
as none of the helpers exported for BPF_MAP_TYPE_RINGBUF will apply to
a user-space producer ring buffer, and we'll want to add one or more
helper functions that would not apply for a kernel-producer ring
buffer.

This patch therefore adds a new BPF_MAP_TYPE_USER_RINGBUF map type
definition. The map type is useless in its current form, as there is no
way to access or use it for anything until we add one or more BPF
helpers. A follow-on patch will therefore add a new helper function
that allows BPF programs to run callbacks on samples that are published
to the ring buffer.
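As a taste of where this is headed, a BPF program would declare such a
map with the usual libbpf map-definition macros. A minimal sketch (the
map name and size are illustrative; as for BPF_MAP_TYPE_RINGBUF, the
size must be a page-aligned power of two):

/* User-producer ring buffer, declared from the BPF program side. */
struct {
	__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
	__uint(max_entries, 256 * 1024); /* illustrative size */
} user_ringbuf SEC(".maps");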
Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-2-void@manifault.com --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/ringbuf.c | 62 +++++++++++++++++-- kernel/bpf/verifier.c | 3 + .../bpf/bpftool/Documentation/bpftool-map.rst | 2 +- tools/bpf/bpftool/map.c | 2 +- tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/libbpf.c | 1 + 8 files changed, 65 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b9112b80171..2c6a4f2562a7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index b483aea35f41..754e915748fb 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,10 +38,27 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; - /* Consumer and producer counters are put into separate pages to allow - * mapping consumer page as r/w, but restrict producer page to r/o. - * This protects producer position from being modified by user-space - * application and ruining in-kernel position tracking. + /* Consumer and producer counters are put into separate pages to + * allow each position to be mapped with different permissions. + * This prevents a user-space application from modifying the + * position and ruining in-kernel tracking. The permissions of the + * pages depend on who is producing samples: user-space or the + * kernel. + * + * Kernel-producer + * --------------- + * The producer position and data pages are mapped as r/o in + * userspace. For this approach, bits in the header of samples are + * used to signal to user-space, and to other producers, whether a + * sample is currently being written. + * + * User-space producer + * ------------------- + * Only the page containing the consumer position is mapped r/o in + * user-space. User-space producers also use bits of the header to + * communicate to the kernel, but the kernel must carefully check and + * validate each sample to ensure that they're correctly formatted, and + * fully contained within the ring buffer. 
*/ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); @@ -224,7 +241,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; @@ -242,6 +259,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + RINGBUF_PGOFF); } +static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + if (vma->vm_flags & VM_WRITE) { + if (vma->vm_pgoff == 0) + /* Disallow writable mappings to the consumer pointer, + * and allow writable mappings to both the producer + * position, and the ring buffer data itself. + */ + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ + return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); +} + static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; @@ -269,7 +306,7 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, - .map_mmap = ringbuf_map_mmap, + .map_mmap = ringbuf_map_mmap_kern, .map_poll = ringbuf_map_poll, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, @@ -278,6 +315,19 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_btf_id = &ringbuf_map_btf_ids[0], }; +BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) +const struct bpf_map_ops user_ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap_user, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_id = &user_ringbuf_map_btf_ids[0], +}; + /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, * calculate offset from record metadata to ring buffer in pages, rounded * down. 
This page offset is stored as part of record metadata and allows to diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c6fbcd0afaf..83710b60e708 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6240,6 +6240,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; + case BPF_MAP_TYPE_USER_RINGBUF: + goto error; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -12635,6 +12637,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 7c188a598444..7f3b67a8b48f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -55,7 +55,7 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 38b6bc9c26c3..9a6ca9f31133 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1459,7 +1459,7 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter }\n" + " task_storage | bloom_filter | user_ringbuf }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2ca30ccc774c..d480da05b6de 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -163,6 +163,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", }; static const char * const prog_type_name[] = { From 20571567384428dfc9fe5cf9f2e942e1df13c2dd Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:58 -0500 Subject: [PATCH 055/143] bpf: Add bpf_user_ringbuf_drain() helper In a prior change, we added a new BPF_MAP_TYPE_USER_RINGBUF map type which will allow user-space applications to publish messages to a ring buffer that is consumed by a BPF program in kernel-space. In order for this map-type to be useful, it will require a BPF helper function that BPF programs can invoke to drain samples from the ring buffer, and invoke callbacks on those samples. 
This change adds that capability via a new BPF helper function:

bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags)

BPF programs may invoke this function to run callback_fn() on a series
of samples in the ring buffer. callback_fn() has the following
signature:

long callback_fn(struct bpf_dynptr *dynptr, void *context);

Samples are provided to the callback in the form of struct bpf_dynptr
*'s, which the program can read using BPF helper functions for querying
struct bpf_dynptr's.

In order to support bpf_user_ringbuf_drain(), a new PTR_TO_DYNPTR
register type is added to the verifier to reflect a dynptr that was
allocated by a helper function and passed to a BPF program. Unlike
PTR_TO_STACK dynptrs which are allocated on the stack by a BPF program,
PTR_TO_DYNPTR dynptrs need not use reference tracking, as the BPF
helper is trusted to properly free the dynptr before returning. The
verifier currently only supports PTR_TO_DYNPTR registers that are also
DYNPTR_TYPE_LOCAL.

Note that while the corresponding user-space libbpf logic will be added
in a subsequent patch, this patch does contain an implementation of the
.map_poll() callback for BPF_MAP_TYPE_USER_RINGBUF maps. This
.map_poll() callback guarantees that an epoll-waiting user-space
producer will receive at least one event notification whenever at least
one sample is drained in an invocation of bpf_user_ringbuf_drain(),
provided that the function is not invoked with the BPF_RB_NO_WAKEUP
flag. If the BPF_RB_FORCE_WAKEUP flag is provided, a wakeup
notification is sent even if no sample was drained.

Signed-off-by: David Vernet
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20220920000100.477320-3-void@manifault.com
---
 include/linux/bpf.h | 11 +-
 include/uapi/linux/bpf.h | 38 +++++++
 kernel/bpf/helpers.c | 2 +
 kernel/bpf/ringbuf.c | 181 ++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c | 61 ++++++++++-
 tools/include/uapi/linux/bpf.h | 38 +++++++
 6 files changed, 320 insertions(+), 11 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e0dbe0c0a17e..33e543b86e1a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -451,7 +451,7 @@ enum bpf_type_flag {
 	/* DYNPTR points to memory local to the bpf program. */
 	DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS),
 
-	/* DYNPTR points to a ringbuf record. */
+	/* DYNPTR points to a kernel-produced ringbuf record. */
 	DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS),
 
 	/* Size is known at compile time. */
@@ -656,6 +656,7 @@ enum bpf_reg_type {
 	PTR_TO_MEM, /* reg points to valid memory region */
 	PTR_TO_BUF, /* reg points to a read/write buffer */
 	PTR_TO_FUNC, /* reg points to a bpf program function */
+	PTR_TO_DYNPTR, /* reg points to a dynptr */
 	__BPF_REG_TYPE_MAX,
 
 	/* Extended reg_types. */
@@ -1394,6 +1395,11 @@ struct bpf_array {
 #define BPF_MAP_CAN_READ BIT(0)
 #define BPF_MAP_CAN_WRITE BIT(1)
 
+/* Maximum number of user-producer ring buffer samples that can be drained in
+ * a call to bpf_user_ringbuf_drain().
+ */ +#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) + static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); @@ -2495,6 +2501,7 @@ extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; +extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2639,7 +2646,7 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, - /* Underlying data is a ringbuf record */ + /* Underlying data is a kernel-produced ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e18c85324db6..ead35f39f185 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5388,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5599,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41aeaf3862ec..cb5564c77482 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1659,6 +1659,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; + case BPF_FUNC_user_ringbuf_drain: + return &bpf_user_ringbuf_drain_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 754e915748fb..9e832acf4692 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,6 +38,22 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; + /* For user-space producer ring buffers, an atomic_t busy bit is used + * to synchronize access to the ring buffers in the kernel, rather than + * the spinlock that is used for kernel-producer ring buffers. This is + * done because the ring buffer must hold a lock across a BPF program's + * callback: + * + * __bpf_user_ringbuf_peek() // lock acquired + * -> program callback_fn() + * -> __bpf_user_ringbuf_sample_release() // lock released + * + * It is unsafe and incorrect to hold an IRQ spinlock across what could + * be a long execution window, so we instead simply disallow concurrent + * access to the ring buffer by kernel consumers, and return -EBUSY from + * __bpf_user_ringbuf_peek() if the busy bit is held by another task. + */ + atomic_t busy ____cacheline_aligned_in_smp; /* Consumer and producer counters are put into separate pages to * allow each position to be mapped with different permissions. 
* This prevents a user-space application from modifying the @@ -153,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) return NULL; spin_lock_init(&rb->spinlock); + atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -288,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) return prod_pos - cons_pos; } -static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, - struct poll_table_struct *pts) +static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) +{ + return rb->mask + 1; +} + +static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; @@ -301,13 +323,26 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } +static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) + return EPOLLOUT | EPOLLWRNORM; + return 0; +} + BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_kern, - .map_poll = ringbuf_map_poll, + .map_poll = ringbuf_map_poll_kern, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, @@ -321,6 +356,7 @@ const struct bpf_map_ops user_ringbuf_map_ops = { .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_user, + .map_poll = ringbuf_map_poll_user, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, @@ -362,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); - if (len > rb->mask + 1) + if (len > ringbuf_total_data_sz(rb)) return NULL; cons_pos = smp_load_acquire(&rb->consumer_pos); @@ -509,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) case BPF_RB_AVAIL_DATA: return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: - return rb->mask + 1; + return ringbuf_total_data_sz(rb); case BPF_RB_CONS_POS: return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: @@ -603,3 +639,138 @@ const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; + +static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size) +{ + int err; + u32 hdr_len, sample_len, total_len, flags, *hdr; + u64 cons_pos, prod_pos; + + /* Synchronizes with smp_store_release() in user-space producer. */ + prod_pos = smp_load_acquire(&rb->producer_pos); + if (prod_pos % 8) + return -EINVAL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */ + cons_pos = smp_load_acquire(&rb->consumer_pos); + if (cons_pos >= prod_pos) + return -ENODATA; + + hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask)); + /* Synchronizes with smp_store_release() in user-space producer. 
*/ + hdr_len = smp_load_acquire(hdr); + flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT); + sample_len = hdr_len & ~flags; + total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8); + + /* The sample must fit within the region advertised by the producer position. */ + if (total_len > prod_pos - cons_pos) + return -EINVAL; + + /* The sample must fit within the data region of the ring buffer. */ + if (total_len > ringbuf_total_data_sz(rb)) + return -E2BIG; + + /* The sample must fit into a struct bpf_dynptr. */ + err = bpf_dynptr_check_size(sample_len); + if (err) + return -E2BIG; + + if (flags & BPF_RINGBUF_DISCARD_BIT) { + /* If the discard bit is set, the sample should be skipped. + * + * Update the consumer pos, and return -EAGAIN so the caller + * knows to skip this sample and try to read the next one. + */ + smp_store_release(&rb->consumer_pos, cons_pos + total_len); + return -EAGAIN; + } + + if (flags & BPF_RINGBUF_BUSY_BIT) + return -ENODATA; + + *sample = (void *)((uintptr_t)rb->data + + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); + *size = sample_len; + return 0; +} + +static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) +{ + u64 consumer_pos; + u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + + /* Using smp_load_acquire() is unnecessary here, as the busy-bit + * prevents another task from writing to consumer_pos after it was read + * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). + */ + consumer_pos = rb->consumer_pos; + /* Synchronizes with smp_load_acquire() in user-space producer. */ + smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); +} + +BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, + void *, callback_fn, void *, callback_ctx, u64, flags) +{ + struct bpf_ringbuf *rb; + long samples, discarded_samples = 0, ret = 0; + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; + int busy = 0; + + if (unlikely(flags & ~wakeup_flags)) + return -EINVAL; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + /* If another consumer is already consuming a sample, wait for them to finish. */ + if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) + return -EBUSY; + + for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { + int err; + u32 size; + void *sample; + struct bpf_dynptr_kern dynptr; + + err = __bpf_user_ringbuf_peek(rb, &sample, &size); + if (err) { + if (err == -ENODATA) { + break; + } else if (err == -EAGAIN) { + discarded_samples++; + continue; + } else { + ret = err; + goto schedule_work_return; + } + } + + bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); + ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); + __bpf_user_ringbuf_sample_release(rb, size, flags); + } + ret = samples - discarded_samples; + +schedule_work_return: + /* Prevent the clearing of the busy-bit from being reordered before the + * storing of any rb consumer or producer positions. 
+ */ + smp_mb__before_atomic(); + atomic_set(&rb->busy, 0); + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) + irq_work_queue(&rb->work); + return ret; +} + +const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { + .func = bpf_user_ringbuf_drain, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 83710b60e708..c76fa45a5906 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -563,6 +563,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", + [PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -5688,6 +5689,12 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types dynptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + } +}; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -5714,7 +5721,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, [ARG_PTR_TO_KPTR] = &kptr_types, - [ARG_PTR_TO_DYNPTR] = &stack_ptr_types, + [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -6066,6 +6073,13 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: + /* We only need to check for initialized / uninitialized helper + * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the + * assumption is that if it is, that a helper function + * initialized the dynptr on behalf of the BPF program. 
+ */ + if (base_type(reg->type) == PTR_TO_DYNPTR) + break; if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -6241,7 +6255,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_USER_RINGBUF: - goto error; + if (func_id != BPF_FUNC_user_ringbuf_drain) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -6361,6 +6377,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; + case BPF_FUNC_user_ringbuf_drain: + if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF) + goto error; + break; case BPF_FUNC_get_stackid: if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; @@ -6887,6 +6907,29 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void + * callback_ctx, u64 flags); + * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); + callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -7346,12 +7389,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn case BPF_FUNC_dynptr_data: for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { if (arg_type_is_dynptr(fn->arg_type[i])) { + struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; + if (meta.ref_obj_id) { verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); return -EFAULT; } - /* Find the id of the dynptr we're tracking the reference of */ - meta.ref_obj_id = stack_slot_get_id(env, ®s[BPF_REG_1 + i]); + + if (base_type(reg->type) != PTR_TO_DYNPTR) + /* Find the id of the dynptr we're + * tracking the reference of + */ + meta.ref_obj_id = stack_slot_get_id(env, reg); break; } } @@ -7360,6 +7409,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } break; + case BPF_FUNC_user_ringbuf_drain: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_user_ringbuf_callback_state); + break; } if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e18c85324db6..ead35f39f185 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5388,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. 
 *
+ * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags)
+ *	Description
+ *		Drain samples from the specified user ring buffer, and invoke
+ *		the provided callback for each such sample:
+ *
+ *		long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx);
+ *
+ *		If **callback_fn** returns 0, the helper will continue to try
+ *		and drain the next sample, up to a maximum of
+ *		BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1,
+ *		the helper will skip the rest of the samples and return. Other
+ *		return values are not used now, and will be rejected by the
+ *		verifier.
+ *	Return
+ *		The number of drained samples if no error was encountered while
+ *		draining samples, or 0 if no samples were present in the ring
+ *		buffer. If a user-space producer was epoll-waiting on this map,
+ *		and at least one sample was drained, they will receive an event
+ *		notification notifying them of available space in the ring
+ *		buffer. If the BPF_RB_NO_WAKEUP flag is passed to this
+ *		function, no wakeup notification will be sent. If the
+ *		BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will
+ *		be sent even if no sample was drained.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EBUSY** if the ring buffer is contended, and another calling
+ *		context was concurrently draining the ring buffer.
+ *
+ *		**-EINVAL** if user-space is not properly tracking the ring
+ *		buffer due to the producer position not being aligned to 8
+ *		bytes, a sample not being aligned to 8 bytes, or the producer
+ *		position not matching the advertised length of a sample.
+ *
+ *		**-E2BIG** if user-space has tried to publish a sample which is
+ *		larger than the size of the ring buffer, or which cannot fit
+ *		within a struct bpf_dynptr.
 */
#define __BPF_FUNC_MAPPER(FN) \
	FN(unspec), \
@@ -5599,6 +5636,7 @@ union bpf_attr {
	FN(tcp_raw_check_syncookie_ipv4), \
	FN(tcp_raw_check_syncookie_ipv6), \
	FN(ktime_get_tai_ns), \
+	FN(user_ringbuf_drain), \
	/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper

From b66ccae01f1ddce47fe2c7f393a3a5c5ab3d7f06 Mon Sep 17 00:00:00 2001
From: David Vernet
Date: Mon, 19 Sep 2022 19:00:59 -0500
Subject: [PATCH 056/143] bpf: Add libbpf logic for user-space ring buffer

Now that all of the logic is in place in the kernel to support user-space
produced ring buffers, we can add the user-space logic to libbpf. This
patch therefore adds the following public symbols to libbpf:

struct user_ring_buffer *
user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts);
void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size);
void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb,
                                         __u32 size, int timeout_ms);
void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample);
void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample);
void user_ring_buffer__free(struct user_ring_buffer *rb);

A user-space producer must first create a struct user_ring_buffer * object
with user_ring_buffer__new(), and can then reserve samples in the ring
buffer using one of the following two symbols:

void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size);
void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb,
                                         __u32 size, int timeout_ms);

With user_ring_buffer__reserve(), a pointer to a 'size' region of the ring
buffer will be returned if sufficient space is available in the buffer.
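As a rough sketch (not part of this patch; the 'struct sample' type and
the 'map_fd' descriptor are hypothetical), a minimal producer built on
these symbols might look as follows:

    struct user_ring_buffer *rb;
    struct sample *entry;

    rb = user_ring_buffer__new(map_fd, NULL /* opts */);
    if (!rb)
        return -errno;

    entry = user_ring_buffer__reserve(rb, sizeof(*entry));
    if (!entry)
        return -errno; /* errno is E2BIG or ENOSPC */

    entry->value = 42;
    user_ring_buffer__submit(rb, entry);
    user_ring_buffer__free(rb);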
user_ring_buffer__reserve_blocking() provides similar semantics, but will block for up to 'timeout_ms' in epoll_wait if there is insufficient space in the buffer. This function has the guarantee from the kernel that it will receive at least one event-notification per invocation to bpf_ringbuf_drain(), provided that at least one sample is drained, and the BPF program did not pass the BPF_RB_NO_WAKEUP flag to bpf_ringbuf_drain(). Once a sample is reserved, it must either be committed to the ring buffer with user_ring_buffer__submit(), or discarded with user_ring_buffer__discard(). Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-4-void@manifault.com --- tools/lib/bpf/libbpf.c | 10 +- tools/lib/bpf/libbpf.h | 107 +++++++++++++ tools/lib/bpf/libbpf.map | 10 ++ tools/lib/bpf/libbpf_probes.c | 1 + tools/lib/bpf/libbpf_version.h | 2 +- tools/lib/bpf/ringbuf.c | 271 +++++++++++++++++++++++++++++++++ 6 files changed, 398 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d480da05b6de..67bc18506150 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2373,6 +2373,12 @@ static size_t adjust_ringbuf_sz(size_t sz) return sz; } +static bool map_is_ringbuf(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_RINGBUF || + map->def.type == BPF_MAP_TYPE_USER_RINGBUF; +} + static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) { map->def.type = def->map_type; @@ -2387,7 +2393,7 @@ static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def map->btf_value_type_id = def->value_type_id; /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ - if (map->def.type == BPF_MAP_TYPE_RINGBUF) + if (map_is_ringbuf(map)) map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); if (def->parts & MAP_DEF_MAP_TYPE) @@ -4370,7 +4376,7 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) map->def.max_entries = max_entries; /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ - if (map->def.type == BPF_MAP_TYPE_RINGBUF) + if (map_is_ringbuf(map)) map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); return 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 88a1ac34b12a..e2d8c17f2e85 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1011,6 +1011,7 @@ LIBBPF_API int bpf_tc_query(const struct bpf_tc_hook *hook, /* Ring buffer APIs */ struct ring_buffer; +struct user_ring_buffer; typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size); @@ -1030,6 +1031,112 @@ LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); +struct user_ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ +}; + +#define user_ring_buffer_opts__last_field sz + +/* @brief **user_ring_buffer__new()** creates a new instance of a user ring + * buffer. + * + * @param map_fd A file descriptor to a BPF_MAP_TYPE_USER_RINGBUF map. + * @param opts Options for how the ring buffer should be created. + * @return A user ring buffer on success; NULL and errno being set on a + * failure. 
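+ *
+ * Note that **opts** may be NULL; the options struct currently only carries
+ * its own size, for forward/backward compatibility.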
+ */ +LIBBPF_API struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts); + +/* @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the + * user ring buffer. + * @param rb A pointer to a user ring buffer. + * @param size The size of the sample, in bytes. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize accessing + * this function if there are multiple producers. If a size is requested that + * is larger than the size of the entire ring buffer, errno will be set to + * E2BIG and NULL is returned. If the ring buffer could accommodate the size, + * but currently does not have enough space, errno is set to ENOSPC and NULL is + * returned. + * + * After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the kernel. Otherwise, + * the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size); + +/* @brief **user_ring_buffer__reserve_blocking()** reserves a record in the + * ring buffer, possibly blocking for up to @timeout_ms until a sample becomes + * available. + * @param rb The user ring buffer. + * @param size The size of the sample, in bytes. + * @param timeout_ms The amount of time, in milliseconds, for which the caller + * should block when waiting for a sample. -1 causes the caller to block + * indefinitely. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize + * accessing this function if there are multiple producers + * + * If **timeout_ms** is -1, the function will block indefinitely until a sample + * becomes available. Otherwise, **timeout_ms** must be non-negative, or errno + * is set to EINVAL, and NULL is returned. If **timeout_ms** is 0, no blocking + * will occur and the function will return immediately after attempting to + * reserve a sample. + * + * If **size** is larger than the size of the entire ring buffer, errno is set + * to E2BIG and NULL is returned. If the ring buffer could accommodate + * **size**, but currently does not have enough space, the caller will block + * until at most **timeout_ms** has elapsed. If insufficient space is available + * at that time, errno is set to ENOSPC, and NULL is returned. + * + * The kernel guarantees that it will wake up this thread to check if + * sufficient space is available in the ring buffer at least once per + * invocation of the **bpf_ringbuf_drain()** helper function, provided that at + * least one sample is consumed, and the BPF program did not invoke the + * function with BPF_RB_NO_WAKEUP. A wakeup may occur sooner than that, but the + * kernel does not guarantee this. If the helper function is invoked with + * BPF_RB_FORCE_WAKEUP, a wakeup event will be sent even if no sample is + * consumed. + * + * When a sample of size **size** is found within **timeout_ms**, a pointer to + * the sample is returned. After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the ring buffer. + * Otherwise, the sample must be freed with **user_ring_buffer__discard()**. 
+ */ +LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, + __u32 size, + int timeout_ms); + +/* @brief **user_ring_buffer__submit()** submits a previously reserved sample + * into the ring buffer. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample); + +/* @brief **user_ring_buffer__discard()** discards a previously reserved sample. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample); + +/* @brief **user_ring_buffer__free()** frees a ring buffer that was previously + * created with **user_ring_buffer__new()**. + * @param rb The user ring buffer being freed. + */ +LIBBPF_API void user_ring_buffer__free(struct user_ring_buffer *rb); + /* Perf buffer APIs */ struct perf_buffer; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 2b928dc21af0..c1d6aa7c82b6 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -368,3 +368,13 @@ LIBBPF_1.0.0 { libbpf_bpf_prog_type_str; perf_buffer__buffer; }; + +LIBBPF_1.1.0 { + global: + user_ring_buffer__discard; + user_ring_buffer__free; + user_ring_buffer__new; + user_ring_buffer__reserve; + user_ring_buffer__reserve_blocking; + user_ring_buffer__submit; +} LIBBPF_1.0.0; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 6d495656f554..f3a8e8e74eb8 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -231,6 +231,7 @@ static int probe_map_create(enum bpf_map_type map_type) return btf_fd; break; case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: key_size = 0; value_size = 0; max_entries = 4096; diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index 2fb2f4290080..e944f5bce728 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 0 +#define LIBBPF_MINOR_VERSION 1 #endif /* __LIBBPF_VERSION_H */ diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 8bc117bcc7bc..d285171d4b69 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "libbpf.h" #include "libbpf_internal.h" @@ -39,6 +40,23 @@ struct ring_buffer { int ring_cnt; }; +struct user_ring_buffer { + struct epoll_event event; + unsigned long *consumer_pos; + unsigned long *producer_pos; + void *data; + unsigned long mask; + size_t page_size; + int map_fd; + int epoll_fd; +}; + +/* 8-byte ring buffer header structure */ +struct ringbuf_hdr { + __u32 len; + __u32 pad; +}; + static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) { if (r->consumer_pos) { @@ -300,3 +318,256 @@ int ring_buffer__epoll_fd(const struct ring_buffer *rb) { return rb->epoll_fd; } + +static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) +{ + if (rb->consumer_pos) { + munmap(rb->consumer_pos, rb->page_size); + rb->consumer_pos = NULL; + } + if (rb->producer_pos) { + munmap(rb->producer_pos, rb->page_size + 2 * (rb->mask + 1)); + rb->producer_pos = NULL; + } +} + +void user_ring_buffer__free(struct user_ring_buffer 
*rb) +{ + if (!rb) + return; + + user_ringbuf_unmap_ring(rb); + + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb); +} + +static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + void *tmp; + struct epoll_event *rb_epoll; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_obj_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err); + return err; + } + + if (info.type != BPF_MAP_TYPE_USER_RINGBUF) { + pr_warn("user ringbuf: map fd=%d is not BPF_MAP_TYPE_USER_RINGBUF\n", map_fd); + return -EINVAL; + } + + rb->map_fd = map_fd; + rb->mask = info.max_entries - 1; + + /* Map read-only consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return err; + } + rb->consumer_pos = tmp; + + /* Map read-write the producer page and data pages. We map the data + * region as twice the total size of the ring buffer to allow the + * simple reading and writing of samples that wrap around the end of + * the buffer. See the kernel implementation for details. + */ + tmp = mmap(NULL, rb->page_size + 2 * info.max_entries, + PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return err; + } + + rb->producer_pos = tmp; + rb->data = tmp + rb->page_size; + + rb_epoll = &rb->event; + rb_epoll->events = EPOLLOUT; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) { + err = -errno; + pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err); + return err; + } + + return 0; +} + +struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts) +{ + struct user_ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, user_ring_buffer_opts)) + return errno = EINVAL, NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return errno = ENOMEM, NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("user ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = user_ringbuf_map(rb, map_fd); + if (err) + goto err_out; + + return rb; + +err_out: + user_ring_buffer__free(rb); + return errno = -err, NULL; +} + +static void user_ringbuf_commit(struct user_ring_buffer *rb, void *sample, bool discard) +{ + __u32 new_len; + struct ringbuf_hdr *hdr; + uintptr_t hdr_offset; + + hdr_offset = rb->mask + 1 + (sample - rb->data) - BPF_RINGBUF_HDR_SZ; + hdr = rb->data + (hdr_offset & rb->mask); + + new_len = hdr->len & ~BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. 
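+	 * Clearing the busy bit with a release operation also ensures that
+	 * the sample data written by the producer is visible to the kernel
+	 * before it observes the updated header.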
+ */ + __atomic_exchange_n(&hdr->len, new_len, __ATOMIC_ACQ_REL); +} + +void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, true); +} + +void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, false); +} + +void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size) +{ + __u32 avail_size, total_size, max_size; + /* 64-bit to avoid overflow in case of extreme application behavior */ + __u64 cons_pos, prod_pos; + struct ringbuf_hdr *hdr; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + cons_pos = smp_load_acquire(rb->consumer_pos); + /* Synchronizes with smp_store_release() in user_ringbuf_commit() */ + prod_pos = smp_load_acquire(rb->producer_pos); + + max_size = rb->mask + 1; + avail_size = max_size - (prod_pos - cons_pos); + /* Round up total size to a multiple of 8. */ + total_size = (size + BPF_RINGBUF_HDR_SZ + 7) / 8 * 8; + + if (total_size > max_size) + return errno = E2BIG, NULL; + + if (avail_size < total_size) + return errno = ENOSPC, NULL; + + hdr = rb->data + (prod_pos & rb->mask); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pad = 0; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + smp_store_release(rb->producer_pos, prod_pos + total_size); + + return (void *)rb->data + ((prod_pos + BPF_RINGBUF_HDR_SZ) & rb->mask); +} + +static __u64 ns_elapsed_timespec(const struct timespec *start, const struct timespec *end) +{ + __u64 start_ns, end_ns, ns_per_s = 1000000000; + + start_ns = (__u64)start->tv_sec * ns_per_s + start->tv_nsec; + end_ns = (__u64)end->tv_sec * ns_per_s + end->tv_nsec; + + return end_ns - start_ns; +} + +void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, __u32 size, int timeout_ms) +{ + void *sample; + int err, ms_remaining = timeout_ms; + struct timespec start; + + if (timeout_ms < 0 && timeout_ms != -1) + return errno = EINVAL, NULL; + + if (timeout_ms != -1) { + err = clock_gettime(CLOCK_MONOTONIC, &start); + if (err) + return NULL; + } + + do { + int cnt, ms_elapsed; + struct timespec curr; + __u64 ns_per_ms = 1000000; + + sample = user_ring_buffer__reserve(rb, size); + if (sample) + return sample; + else if (errno != ENOSPC) + return NULL; + + /* The kernel guarantees at least one event notification + * delivery whenever at least one sample is drained from the + * ring buffer in an invocation to bpf_ringbuf_drain(). Other + * additional events may be delivered at any time, but only one + * event is guaranteed per bpf_ringbuf_drain() invocation, + * provided that a sample is drained, and the BPF program did + * not pass BPF_RB_NO_WAKEUP to bpf_ringbuf_drain(). If + * BPF_RB_FORCE_WAKEUP is passed to bpf_ringbuf_drain(), a + * wakeup event will be delivered even if no samples are + * drained. + */ + cnt = epoll_wait(rb->epoll_fd, &rb->event, 1, ms_remaining); + if (cnt < 0) + return NULL; + + if (timeout_ms == -1) + continue; + + err = clock_gettime(CLOCK_MONOTONIC, &curr); + if (err) + return NULL; + + ms_elapsed = ns_elapsed_timespec(&start, &curr) / ns_per_ms; + ms_remaining = timeout_ms - ms_elapsed; + } while (ms_remaining > 0); + + /* Try one more time to reserve a sample after the specified timeout has elapsed. 
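+	 * An event notification may have arrived just as the timeout expired,
+	 * so this final attempt can still succeed.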
*/ + return user_ring_buffer__reserve(rb, size); +} From e5a9df51c74671cfe15af1d50e5f508bd3efddab Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:01:00 -0500 Subject: [PATCH 057/143] selftests/bpf: Add selftests validating the user ringbuf This change includes selftests that validate the expected behavior and APIs of the new BPF_MAP_TYPE_USER_RINGBUF map type. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-5-void@manifault.com --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + .../selftests/bpf/prog_tests/user_ringbuf.c | 754 ++++++++++++++++++ .../selftests/bpf/progs/test_user_ringbuf.h | 35 + .../selftests/bpf/progs/user_ringbuf_fail.c | 177 ++++ .../bpf/progs/user_ringbuf_success.c | 218 +++++ 5 files changed, 1185 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/user_ringbuf.c create mode 100644 tools/testing/selftests/bpf/progs/test_user_ringbuf.h create mode 100644 tools/testing/selftests/bpf/progs/user_ringbuf_fail.c create mode 100644 tools/testing/selftests/bpf/progs/user_ringbuf_success.c diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 168c5b287b5c..981c2be922f4 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -71,3 +71,4 @@ cb_refs # expected error message unexpected err cgroup_hierarchical_stats # JIT does not support calling kernel function (kfunc) htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) tracing_struct # failed to auto-attach: -524 (trampoline) +user_ringbuf # failed to find kernel BTF type ID of '__s390x_sys_prctl': -3 (?) diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c new file mode 100644 index 000000000000..02b18d018b36 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -0,0 +1,754 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "user_ringbuf_fail.skel.h" +#include "user_ringbuf_success.skel.h" + +#include "../progs/test_user_ringbuf.h" + +static size_t log_buf_sz = 1 << 20; /* 1 MB */ +static char obj_log_buf[1048576]; +static const long c_sample_size = sizeof(struct sample) + BPF_RINGBUF_HDR_SZ; +static const long c_ringbuf_size = 1 << 12; /* 1 small page */ +static const long c_max_entries = c_ringbuf_size / c_sample_size; + +static void drain_current_samples(void) +{ + syscall(__NR_getpgid); +} + +static int write_samples(struct user_ring_buffer *ringbuf, uint32_t num_samples) +{ + int i, err = 0; + + /* Write some number of samples to the ring buffer. */ + for (i = 0; i < num_samples; i++) { + struct sample *entry; + int read; + + entry = user_ring_buffer__reserve(ringbuf, sizeof(*entry)); + if (!entry) { + err = -errno; + goto done; + } + + entry->pid = getpid(); + entry->seq = i; + entry->value = i * i; + + read = snprintf(entry->comm, sizeof(entry->comm), "%u", i); + if (read <= 0) { + /* Assert on the error path to avoid spamming logs with + * mostly success messages. 
+ */ + ASSERT_GT(read, 0, "snprintf_comm"); + err = read; + user_ring_buffer__discard(ringbuf, entry); + goto done; + } + + user_ring_buffer__submit(ringbuf, entry); + } + +done: + drain_current_samples(); + + return err; +} + +static struct user_ringbuf_success *open_load_ringbuf_skel(void) +{ + struct user_ringbuf_success *skel; + int err; + + skel = user_ringbuf_success__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return NULL; + + err = bpf_map__set_max_entries(skel->maps.user_ringbuf, c_ringbuf_size); + if (!ASSERT_OK(err, "set_max_entries")) + goto cleanup; + + err = bpf_map__set_max_entries(skel->maps.kernel_ringbuf, c_ringbuf_size); + if (!ASSERT_OK(err, "set_max_entries")) + goto cleanup; + + err = user_ringbuf_success__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + return skel; + +cleanup: + user_ringbuf_success__destroy(skel); + return NULL; +} + +static void test_user_ringbuf_mappings(void) +{ + int err, rb_fd; + int page_size = getpagesize(); + void *mmap_ptr; + struct user_ringbuf_success *skel; + + skel = open_load_ringbuf_skel(); + if (!skel) + return; + + rb_fd = bpf_map__fd(skel->maps.user_ringbuf); + /* cons_pos can be mapped R/O, can't add +X with mprotect. */ + mmap_ptr = mmap(NULL, page_size, PROT_READ, MAP_SHARED, rb_fd, 0); + ASSERT_OK_PTR(mmap_ptr, "ro_cons_pos"); + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_WRITE), "write_cons_pos_protect"); + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_cons_pos_protect"); + ASSERT_ERR_PTR(mremap(mmap_ptr, 0, 4 * page_size, MREMAP_MAYMOVE), "wr_prod_pos"); + err = -errno; + ASSERT_ERR(err, "wr_prod_pos_err"); + ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_ro_cons"); + + /* prod_pos can be mapped RW, can't add +X with mprotect. */ + mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, + rb_fd, page_size); + ASSERT_OK_PTR(mmap_ptr, "rw_prod_pos"); + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_prod_pos_protect"); + err = -errno; + ASSERT_ERR(err, "wr_prod_pos_err"); + ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw_prod"); + + /* data pages can be mapped RW, can't add +X with mprotect. 
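+	 * The data region lives at an offset of two pages into the map: the
+	 * read-only consumer page is at offset 0, and the read-write producer
+	 * page is at offset page_size.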
*/ + mmap_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, + 2 * page_size); + ASSERT_OK_PTR(mmap_ptr, "rw_data"); + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_data_protect"); + err = -errno; + ASSERT_ERR(err, "exec_data_err"); + ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw_data"); + + user_ringbuf_success__destroy(skel); +} + +static int load_skel_create_ringbufs(struct user_ringbuf_success **skel_out, + struct ring_buffer **kern_ringbuf_out, + ring_buffer_sample_fn callback, + struct user_ring_buffer **user_ringbuf_out) +{ + struct user_ringbuf_success *skel; + struct ring_buffer *kern_ringbuf = NULL; + struct user_ring_buffer *user_ringbuf = NULL; + int err = -ENOMEM, rb_fd; + + skel = open_load_ringbuf_skel(); + if (!skel) + return err; + + /* only trigger BPF program for current process */ + skel->bss->pid = getpid(); + + if (kern_ringbuf_out) { + rb_fd = bpf_map__fd(skel->maps.kernel_ringbuf); + kern_ringbuf = ring_buffer__new(rb_fd, callback, skel, NULL); + if (!ASSERT_OK_PTR(kern_ringbuf, "kern_ringbuf_create")) + goto cleanup; + + *kern_ringbuf_out = kern_ringbuf; + } + + if (user_ringbuf_out) { + rb_fd = bpf_map__fd(skel->maps.user_ringbuf); + user_ringbuf = user_ring_buffer__new(rb_fd, NULL); + if (!ASSERT_OK_PTR(user_ringbuf, "user_ringbuf_create")) + goto cleanup; + + *user_ringbuf_out = user_ringbuf; + ASSERT_EQ(skel->bss->read, 0, "no_reads_after_load"); + } + + err = user_ringbuf_success__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + *skel_out = skel; + return 0; + +cleanup: + if (kern_ringbuf_out) + *kern_ringbuf_out = NULL; + if (user_ringbuf_out) + *user_ringbuf_out = NULL; + ring_buffer__free(kern_ringbuf); + user_ring_buffer__free(user_ringbuf); + user_ringbuf_success__destroy(skel); + return err; +} + +static int load_skel_create_user_ringbuf(struct user_ringbuf_success **skel_out, + struct user_ring_buffer **ringbuf_out) +{ + return load_skel_create_ringbufs(skel_out, NULL, NULL, ringbuf_out); +} + +static void manually_write_test_invalid_sample(struct user_ringbuf_success *skel, + __u32 size, __u64 producer_pos, int err) +{ + void *data_ptr; + __u64 *producer_pos_ptr; + int rb_fd, page_size = getpagesize(); + + rb_fd = bpf_map__fd(skel->maps.user_ringbuf); + + ASSERT_EQ(skel->bss->read, 0, "num_samples_before_bad_sample"); + + /* Map the producer_pos as RW. */ + producer_pos_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, rb_fd, page_size); + ASSERT_OK_PTR(producer_pos_ptr, "producer_pos_ptr"); + + /* Map the data pages as RW. */ + data_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, 2 * page_size); + ASSERT_OK_PTR(data_ptr, "rw_data"); + + memset(data_ptr, 0, BPF_RINGBUF_HDR_SZ); + *(__u32 *)data_ptr = size; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in the kernel. 
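+	 * The producer position is deliberately set to a caller-chosen value
+	 * here so that the kernel observes a malformed sample and reports the
+	 * error the test expects.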
*/ + smp_store_release(producer_pos_ptr, producer_pos + BPF_RINGBUF_HDR_SZ); + + drain_current_samples(); + ASSERT_EQ(skel->bss->read, 0, "num_samples_after_bad_sample"); + ASSERT_EQ(skel->bss->err, err, "err_after_bad_sample"); + + ASSERT_OK(munmap(producer_pos_ptr, page_size), "unmap_producer_pos"); + ASSERT_OK(munmap(data_ptr, page_size), "unmap_data_ptr"); +} + +static void test_user_ringbuf_post_misaligned(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + __u32 size = (1 << 5) + 7; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "misaligned_skel")) + return; + + manually_write_test_invalid_sample(skel, size, size, -EINVAL); + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_post_producer_wrong_offset(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + __u32 size = (1 << 5); + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "wrong_offset_skel")) + return; + + manually_write_test_invalid_sample(skel, size, size - 8, -EINVAL); + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_post_larger_than_ringbuf_sz(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + __u32 size = c_ringbuf_size; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "huge_sample_skel")) + return; + + manually_write_test_invalid_sample(skel, size, size, -E2BIG); + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_basic(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "ringbuf_basic_skel")) + return; + + ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before"); + + err = write_samples(ringbuf, 2); + if (!ASSERT_OK(err, "write_samples")) + goto cleanup; + + ASSERT_EQ(skel->bss->read, 2, "num_samples_read_after"); + +cleanup: + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_sample_full_ring_buffer(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + void *sample; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "ringbuf_full_sample_skel")) + return; + + sample = user_ring_buffer__reserve(ringbuf, c_ringbuf_size - BPF_RINGBUF_HDR_SZ); + if (!ASSERT_OK_PTR(sample, "full_sample")) + goto cleanup; + + user_ring_buffer__submit(ringbuf, sample); + ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before"); + drain_current_samples(); + ASSERT_EQ(skel->bss->read, 1, "num_samples_read_after"); + +cleanup: + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_post_alignment_autoadjust(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + struct sample *sample; + int err; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "ringbuf_align_autoadjust_skel")) + return; + + /* libbpf should automatically round any sample up to an 8-byte alignment. 
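+	 * The sizeof(*sample) + 1 request below is rounded up, together with
+	 * the 8-byte ring buffer header, to a multiple of 8 by
+	 * user_ring_buffer__reserve().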
*/ + sample = user_ring_buffer__reserve(ringbuf, sizeof(*sample) + 1); + ASSERT_OK_PTR(sample, "reserve_autoaligned"); + user_ring_buffer__submit(ringbuf, sample); + + ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before"); + drain_current_samples(); + ASSERT_EQ(skel->bss->read, 1, "num_samples_read_after"); + + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_overfill(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (err) + return; + + err = write_samples(ringbuf, c_max_entries * 5); + ASSERT_ERR(err, "write_samples"); + ASSERT_EQ(skel->bss->read, c_max_entries, "max_entries"); + + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_discards_properly_ignored(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err, num_discarded = 0; + __u64 *token; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (err) + return; + + ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before"); + + while (1) { + /* Write samples until the buffer is full. */ + token = user_ring_buffer__reserve(ringbuf, sizeof(*token)); + if (!token) + break; + + user_ring_buffer__discard(ringbuf, token); + num_discarded++; + } + + if (!ASSERT_GE(num_discarded, 0, "num_discarded")) + goto cleanup; + + /* Should not read any samples, as they are all discarded. */ + ASSERT_EQ(skel->bss->read, 0, "num_pre_kick"); + drain_current_samples(); + ASSERT_EQ(skel->bss->read, 0, "num_post_kick"); + + /* Now that the ring buffer has been drained, we should be able to + * reserve another token. + */ + token = user_ring_buffer__reserve(ringbuf, sizeof(*token)); + + if (!ASSERT_OK_PTR(token, "new_token")) + goto cleanup; + + user_ring_buffer__discard(ringbuf, token); +cleanup: + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_loop(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + uint32_t total_samples = 8192; + uint32_t remaining_samples = total_samples; + int err; + + BUILD_BUG_ON(total_samples <= c_max_entries); + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (err) + return; + + do { + uint32_t curr_samples; + + curr_samples = remaining_samples > c_max_entries + ? c_max_entries : remaining_samples; + err = write_samples(ringbuf, curr_samples); + if (err != 0) { + /* Assert inside of if statement to avoid flooding logs + * on the success path. + */ + ASSERT_OK(err, "write_samples"); + goto cleanup; + } + + remaining_samples -= curr_samples; + ASSERT_EQ(skel->bss->read, total_samples - remaining_samples, + "current_batched_entries"); + } while (remaining_samples > 0); + ASSERT_EQ(skel->bss->read, total_samples, "total_batched_entries"); + +cleanup: + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static int send_test_message(struct user_ring_buffer *ringbuf, + enum test_msg_op op, s64 operand_64, + s32 operand_32) +{ + struct test_msg *msg; + + msg = user_ring_buffer__reserve(ringbuf, sizeof(*msg)); + if (!msg) { + /* Assert on the error path to avoid spamming logs with mostly + * success messages. 
+ */ + ASSERT_OK_PTR(msg, "reserve_msg"); + return -ENOMEM; + } + + msg->msg_op = op; + + switch (op) { + case TEST_MSG_OP_INC64: + case TEST_MSG_OP_MUL64: + msg->operand_64 = operand_64; + break; + case TEST_MSG_OP_INC32: + case TEST_MSG_OP_MUL32: + msg->operand_32 = operand_32; + break; + default: + PRINT_FAIL("Invalid operand %d\n", op); + user_ring_buffer__discard(ringbuf, msg); + return -EINVAL; + } + + user_ring_buffer__submit(ringbuf, msg); + + return 0; +} + +static void kick_kernel_read_messages(void) +{ + syscall(__NR_prctl); +} + +static int handle_kernel_msg(void *ctx, void *data, size_t len) +{ + struct user_ringbuf_success *skel = ctx; + struct test_msg *msg = data; + + switch (msg->msg_op) { + case TEST_MSG_OP_INC64: + skel->bss->user_mutated += msg->operand_64; + return 0; + case TEST_MSG_OP_INC32: + skel->bss->user_mutated += msg->operand_32; + return 0; + case TEST_MSG_OP_MUL64: + skel->bss->user_mutated *= msg->operand_64; + return 0; + case TEST_MSG_OP_MUL32: + skel->bss->user_mutated *= msg->operand_32; + return 0; + default: + fprintf(stderr, "Invalid operand %d\n", msg->msg_op); + return -EINVAL; + } +} + +static void drain_kernel_messages_buffer(struct ring_buffer *kern_ringbuf, + struct user_ringbuf_success *skel) +{ + int cnt; + + cnt = ring_buffer__consume(kern_ringbuf); + ASSERT_EQ(cnt, 8, "consume_kern_ringbuf"); + ASSERT_OK(skel->bss->err, "consume_kern_ringbuf_err"); +} + +static void test_user_ringbuf_msg_protocol(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *user_ringbuf; + struct ring_buffer *kern_ringbuf; + int err, i; + __u64 expected_kern = 0; + + err = load_skel_create_ringbufs(&skel, &kern_ringbuf, handle_kernel_msg, &user_ringbuf); + if (!ASSERT_OK(err, "create_ringbufs")) + return; + + for (i = 0; i < 64; i++) { + enum test_msg_op op = i % TEST_MSG_OP_NUM_OPS; + __u64 operand_64 = TEST_OP_64; + __u32 operand_32 = TEST_OP_32; + + err = send_test_message(user_ringbuf, op, operand_64, operand_32); + if (err) { + /* Only assert on a failure to avoid spamming success logs. */ + ASSERT_OK(err, "send_test_message"); + goto cleanup; + } + + switch (op) { + case TEST_MSG_OP_INC64: + expected_kern += operand_64; + break; + case TEST_MSG_OP_INC32: + expected_kern += operand_32; + break; + case TEST_MSG_OP_MUL64: + expected_kern *= operand_64; + break; + case TEST_MSG_OP_MUL32: + expected_kern *= operand_32; + break; + default: + PRINT_FAIL("Unexpected op %d\n", op); + goto cleanup; + } + + if (i % 8 == 0) { + kick_kernel_read_messages(); + ASSERT_EQ(skel->bss->kern_mutated, expected_kern, "expected_kern"); + ASSERT_EQ(skel->bss->err, 0, "bpf_prog_err"); + drain_kernel_messages_buffer(kern_ringbuf, skel); + } + } + +cleanup: + ring_buffer__free(kern_ringbuf); + user_ring_buffer__free(user_ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void *kick_kernel_cb(void *arg) +{ + /* Kick the kernel, causing it to drain the ring buffer and then wake + * up the test thread waiting on epoll. 
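+	 * The getrlimit syscall triggers the test_user_ringbuf_epoll program,
+	 * which is attached to fentry of sys_getrlimit.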
+	 */
+	syscall(__NR_getrlimit);
+
+	return NULL;
+}
+
+static int spawn_kick_thread_for_poll(void)
+{
+	pthread_t thread;
+
+	return pthread_create(&thread, NULL, kick_kernel_cb, NULL);
+}
+
+static void test_user_ringbuf_blocking_reserve(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	int err, num_written = 0;
+	__u64 *token;
+
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (err)
+		return;
+
+	ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before");
+
+	while (1) {
+		/* Write samples until the buffer is full. */
+		token = user_ring_buffer__reserve(ringbuf, sizeof(*token));
+		if (!token)
+			break;
+
+		*token = 0xdeadbeef;
+
+		user_ring_buffer__submit(ringbuf, token);
+		num_written++;
+	}
+
+	if (!ASSERT_GE(num_written, 0, "num_written"))
+		goto cleanup;
+
+	/* Should not have read any samples until the kernel is kicked. */
+	ASSERT_EQ(skel->bss->read, 0, "num_pre_kick");
+
+	/* We correctly time out after 1 second, without a sample. */
+	token = user_ring_buffer__reserve_blocking(ringbuf, sizeof(*token), 1000);
+	if (!ASSERT_EQ(token, NULL, "pre_kick_timeout_token"))
+		goto cleanup;
+
+	err = spawn_kick_thread_for_poll();
+	if (!ASSERT_EQ(err, 0, "deferred_kick_thread\n"))
+		goto cleanup;
+
+	/* After spawning another thread that asynchronously kicks the kernel to
+	 * drain the messages, we're able to block and successfully get a
+	 * sample once we receive an event notification.
+	 */
+	token = user_ring_buffer__reserve_blocking(ringbuf, sizeof(*token), 10000);
+
+	if (!ASSERT_OK_PTR(token, "block_token"))
+		goto cleanup;
+
+	ASSERT_GT(skel->bss->read, 0, "num_post_kill");
+	ASSERT_LE(skel->bss->read, num_written, "num_post_kill");
+	ASSERT_EQ(skel->bss->err, 0, "err_post_poll");
+	user_ring_buffer__discard(ringbuf, token);
+
+cleanup:
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static struct {
+	const char *prog_name;
+	const char *expected_err_msg;
+} failure_tests[] = {
+	/* failure cases */
+	{"user_ringbuf_callback_bad_access1", "negative offset dynptr_ptr ptr"},
+	{"user_ringbuf_callback_bad_access2", "dereference of modified dynptr_ptr ptr"},
+	{"user_ringbuf_callback_write_forbidden", "invalid mem access 'dynptr_ptr'"},
+	{"user_ringbuf_callback_null_context_write", "invalid mem access 'scalar'"},
+	{"user_ringbuf_callback_null_context_read", "invalid mem access 'scalar'"},
+	{"user_ringbuf_callback_discard_dynptr", "arg 1 is an unacquired reference"},
+	{"user_ringbuf_callback_submit_dynptr", "arg 1 is an unacquired reference"},
+	{"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"},
+};
+
+#define SUCCESS_TEST(_func) { _func, #_func }
+
+static struct {
+	void (*test_callback)(void);
+	const char *test_name;
+} success_tests[] = {
+	SUCCESS_TEST(test_user_ringbuf_mappings),
+	SUCCESS_TEST(test_user_ringbuf_post_misaligned),
+	SUCCESS_TEST(test_user_ringbuf_post_producer_wrong_offset),
+	SUCCESS_TEST(test_user_ringbuf_post_larger_than_ringbuf_sz),
+	SUCCESS_TEST(test_user_ringbuf_basic),
+	SUCCESS_TEST(test_user_ringbuf_sample_full_ring_buffer),
+	SUCCESS_TEST(test_user_ringbuf_post_alignment_autoadjust),
+	SUCCESS_TEST(test_user_ringbuf_overfill),
+	SUCCESS_TEST(test_user_ringbuf_discards_properly_ignored),
+	SUCCESS_TEST(test_user_ringbuf_loop),
+	SUCCESS_TEST(test_user_ringbuf_msg_protocol),
+	SUCCESS_TEST(test_user_ringbuf_blocking_reserve),
+};
+
+static void verify_fail(const char *prog_name, const char *expected_err_msg)
+{
+
LIBBPF_OPTS(bpf_object_open_opts, opts); + struct bpf_program *prog; + struct user_ringbuf_fail *skel; + int err; + + opts.kernel_log_buf = obj_log_buf; + opts.kernel_log_size = log_buf_sz; + opts.kernel_log_level = 1; + + skel = user_ringbuf_fail__open_opts(&opts); + if (!ASSERT_OK_PTR(skel, "dynptr_fail__open_opts")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto cleanup; + + bpf_program__set_autoload(prog, true); + + bpf_map__set_max_entries(skel->maps.user_ringbuf, getpagesize()); + + err = user_ringbuf_fail__load(skel); + if (!ASSERT_ERR(err, "unexpected load success")) + goto cleanup; + + if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) { + fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg); + fprintf(stderr, "Verifier output: %s\n", obj_log_buf); + } + +cleanup: + user_ringbuf_fail__destroy(skel); +} + +void test_user_ringbuf(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(success_tests); i++) { + if (!test__start_subtest(success_tests[i].test_name)) + continue; + + success_tests[i].test_callback(); + } + + for (i = 0; i < ARRAY_SIZE(failure_tests); i++) { + if (!test__start_subtest(failure_tests[i].prog_name)) + continue; + + verify_fail(failure_tests[i].prog_name, failure_tests[i].expected_err_msg); + } +} diff --git a/tools/testing/selftests/bpf/progs/test_user_ringbuf.h b/tools/testing/selftests/bpf/progs/test_user_ringbuf.h new file mode 100644 index 000000000000..1643b4d59ba7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_user_ringbuf.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#ifndef _TEST_USER_RINGBUF_H +#define _TEST_USER_RINGBUF_H + +#define TEST_OP_64 4 +#define TEST_OP_32 2 + +enum test_msg_op { + TEST_MSG_OP_INC64, + TEST_MSG_OP_INC32, + TEST_MSG_OP_MUL64, + TEST_MSG_OP_MUL32, + + // Must come last. + TEST_MSG_OP_NUM_OPS, +}; + +struct test_msg { + enum test_msg_op msg_op; + union { + __s64 operand_64; + __s32 operand_32; + }; +}; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +#endif /* _TEST_USER_RINGBUF_H */ diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c new file mode 100644 index 000000000000..82aba4529aa9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_USER_RINGBUF); +} user_ringbuf SEC(".maps"); + +static long +bad_access1(struct bpf_dynptr *dynptr, void *context) +{ + const struct sample *sample; + + sample = bpf_dynptr_data(dynptr - 1, 0, sizeof(*sample)); + bpf_printk("Was able to pass bad pointer %lx\n", (__u64)dynptr - 1); + + return 0; +} + +/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should + * not be able to read before the pointer. 
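+ * bad_access1 above passes dynptr - 1 to bpf_dynptr_data(); the verifier is
+ * expected to reject this with a "negative offset dynptr_ptr ptr" error.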
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_bad_access1(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, bad_access1, NULL, 0);
+
+	return 0;
+}
+
+static long
+bad_access2(struct bpf_dynptr *dynptr, void *context)
+{
+	const struct sample *sample;
+
+	sample = bpf_dynptr_data(dynptr + 1, 0, sizeof(*sample));
+	bpf_printk("Was able to pass bad pointer %lx\n", (__u64)dynptr + 1);
+
+	return 0;
+}
+
+/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
+ * not be able to read past the end of the pointer.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_bad_access2(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, bad_access2, NULL, 0);
+
+	return 0;
+}
+
+static long
+write_forbidden(struct bpf_dynptr *dynptr, void *context)
+{
+	*((long *)dynptr) = 0;
+
+	return 0;
+}
+
+/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
+ * not be able to write to that pointer.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_write_forbidden(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, write_forbidden, NULL, 0);
+
+	return 0;
+}
+
+static long
+null_context_write(struct bpf_dynptr *dynptr, void *context)
+{
+	*((__u64 *)context) = 0;
+
+	return 0;
+}
+
+/* A callback that tries to write to the NULL context pointer passed to
+ * bpf_user_ringbuf_drain() should have that access rejected by the verifier.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_null_context_write(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, null_context_write, NULL, 0);
+
+	return 0;
+}
+
+static long
+null_context_read(struct bpf_dynptr *dynptr, void *context)
+{
+	__u64 id = *((__u64 *)context);
+
+	bpf_printk("Read id %lu\n", id);
+
+	return 0;
+}
+
+/* A callback that tries to read from the NULL context pointer passed to
+ * bpf_user_ringbuf_drain() should have that access rejected by the verifier.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_null_context_read(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, null_context_read, NULL, 0);
+
+	return 0;
+}
+
+static long
+try_discard_dynptr(struct bpf_dynptr *dynptr, void *context)
+{
+	bpf_ringbuf_discard_dynptr(dynptr, 0);
+
+	return 0;
+}
+
+/* A callback should not be able to discard the dynptr passed to it by
+ * bpf_user_ringbuf_drain(), as the dynptr is not an acquired reference.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_discard_dynptr(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, try_discard_dynptr, NULL, 0);
+
+	return 0;
+}
+
+static long
+try_submit_dynptr(struct bpf_dynptr *dynptr, void *context)
+{
+	bpf_ringbuf_submit_dynptr(dynptr, 0);
+
+	return 0;
+}
+
+/* A callback should not be able to submit the dynptr passed to it by
+ * bpf_user_ringbuf_drain(), as the dynptr is not an acquired reference.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_submit_dynptr(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, try_submit_dynptr, NULL, 0);
+
+	return 0;
+}
+
+static long
+invalid_drain_callback_return(struct bpf_dynptr *dynptr, void *context)
+{
+	return 2;
+}
+
+/* The callback passed to bpf_user_ringbuf_drain() may only return 0 or 1;
+ * any other return value should be rejected by the verifier.
+ */ +SEC("?raw_tp/sys_nanosleep") +int user_ringbuf_callback_invalid_return(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, invalid_drain_callback_return, NULL, 0); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_success.c b/tools/testing/selftests/bpf/progs/user_ringbuf_success.c new file mode 100644 index 000000000000..099c23d9aa21 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/user_ringbuf_success.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "test_user_ringbuf.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_USER_RINGBUF); +} user_ringbuf SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} kernel_ringbuf SEC(".maps"); + +/* inputs */ +int pid, err, val; + +int read = 0; + +/* Counter used for end-to-end protocol test */ +__u64 kern_mutated = 0; +__u64 user_mutated = 0; +__u64 expected_user_mutated = 0; + +static int +is_test_process(void) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + + return cur_pid == pid; +} + +static long +record_sample(struct bpf_dynptr *dynptr, void *context) +{ + const struct sample *sample = NULL; + struct sample stack_sample; + int status; + static int num_calls; + + if (num_calls++ % 2 == 0) { + status = bpf_dynptr_read(&stack_sample, sizeof(stack_sample), dynptr, 0, 0); + if (status) { + bpf_printk("bpf_dynptr_read() failed: %d\n", status); + err = 1; + return 0; + } + } else { + sample = bpf_dynptr_data(dynptr, 0, sizeof(*sample)); + if (!sample) { + bpf_printk("Unexpectedly failed to get sample\n"); + err = 2; + return 0; + } + stack_sample = *sample; + } + + __sync_fetch_and_add(&read, 1); + return 0; +} + +static void +handle_sample_msg(const struct test_msg *msg) +{ + switch (msg->msg_op) { + case TEST_MSG_OP_INC64: + kern_mutated += msg->operand_64; + break; + case TEST_MSG_OP_INC32: + kern_mutated += msg->operand_32; + break; + case TEST_MSG_OP_MUL64: + kern_mutated *= msg->operand_64; + break; + case TEST_MSG_OP_MUL32: + kern_mutated *= msg->operand_32; + break; + default: + bpf_printk("Unrecognized op %d\n", msg->msg_op); + err = 2; + } +} + +static long +read_protocol_msg(struct bpf_dynptr *dynptr, void *context) +{ + const struct test_msg *msg = NULL; + + msg = bpf_dynptr_data(dynptr, 0, sizeof(*msg)); + if (!msg) { + err = 1; + bpf_printk("Unexpectedly failed to get msg\n"); + return 0; + } + + handle_sample_msg(msg); + + return 0; +} + +static int publish_next_kern_msg(__u32 index, void *context) +{ + struct test_msg *msg = NULL; + int operand_64 = TEST_OP_64; + int operand_32 = TEST_OP_32; + + msg = bpf_ringbuf_reserve(&kernel_ringbuf, sizeof(*msg), 0); + if (!msg) { + err = 4; + return 1; + } + + switch (index % TEST_MSG_OP_NUM_OPS) { + case TEST_MSG_OP_INC64: + msg->operand_64 = operand_64; + msg->msg_op = TEST_MSG_OP_INC64; + expected_user_mutated += operand_64; + break; + case TEST_MSG_OP_INC32: + msg->operand_32 = operand_32; + msg->msg_op = TEST_MSG_OP_INC32; + expected_user_mutated += operand_32; + break; + case TEST_MSG_OP_MUL64: + msg->operand_64 = operand_64; + msg->msg_op = TEST_MSG_OP_MUL64; + expected_user_mutated *= operand_64; + break; + case TEST_MSG_OP_MUL32: + msg->operand_32 = operand_32; + msg->msg_op = TEST_MSG_OP_MUL32; + expected_user_mutated *= operand_32; + break; + default: + bpf_ringbuf_discard(msg, 0); + err = 5; + return 1; + } + + bpf_ringbuf_submit(msg, 0); + + return 0; +} + +static 
void +publish_kern_messages(void) +{ + if (expected_user_mutated != user_mutated) { + bpf_printk("%lu != %lu\n", expected_user_mutated, user_mutated); + err = 3; + return; + } + + bpf_loop(8, publish_next_kern_msg, NULL, 0); +} + +SEC("fentry/" SYS_PREFIX "sys_prctl") +int test_user_ringbuf_protocol(void *ctx) +{ + long status = 0; + struct sample *sample = NULL; + struct bpf_dynptr ptr; + + if (!is_test_process()) + return 0; + + status = bpf_user_ringbuf_drain(&user_ringbuf, read_protocol_msg, NULL, 0); + if (status < 0) { + bpf_printk("Drain returned: %ld\n", status); + err = 1; + return 0; + } + + publish_kern_messages(); + + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_user_ringbuf(void *ctx) +{ + int status = 0; + struct sample *sample = NULL; + struct bpf_dynptr ptr; + + if (!is_test_process()) + return 0; + + err = bpf_user_ringbuf_drain(&user_ringbuf, record_sample, NULL, 0); + + return 0; +} + +static long +do_nothing_cb(struct bpf_dynptr *dynptr, void *context) +{ + __sync_fetch_and_add(&read, 1); + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_getrlimit") +int test_user_ringbuf_epoll(void *ctx) +{ + long num_samples; + + if (!is_test_process()) + return 0; + + num_samples = bpf_user_ringbuf_drain(&user_ringbuf, do_nothing_cb, NULL, 0); + if (num_samples <= 0) + err = 1; + + return 0; +} From 9f2f5d7830ddfeeca147595f473e14eadbeb3db1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 9 Sep 2022 19:52:14 -0700 Subject: [PATCH 058/143] libbpf: Improve BPF_PROG2 macro code quality and description Commit 34586d29f8df ("libbpf: Add new BPF_PROG2 macro") added BPF_PROG2 macro for trampoline based programs with struct arguments. Andrii made a few suggestions to improve code quality and description. This patch implemented these suggestions including better internal macro name, consistent usage pattern for __builtin_choose_expr(), simpler macro definition for always-inline func arguments and better macro description. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20220910025214.1536510-1-yhs@fb.com --- tools/lib/bpf/bpf_tracing.h | 152 +++++++++++++++++++++--------------- 1 file changed, 90 insertions(+), 62 deletions(-) diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 8d4bdd18cb3d..2972dc25ff72 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -438,84 +438,112 @@ typeof(name(0)) name(unsigned long long *ctx) \ static __always_inline typeof(name(0)) \ ____##name(unsigned long long *ctx, ##args) -#ifndef ____bpf_nth -#define ____bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N +#ifndef ___bpf_nth2 +#define ___bpf_nth2(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \ + _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N #endif -#ifndef ____bpf_narg -#define ____bpf_narg(...) ____bpf_nth(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0) +#ifndef ___bpf_narg2 +#define ___bpf_narg2(...) 
\ + ___bpf_nth2(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, \ + 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0) #endif -#define BPF_REG_CNT(t) \ - (__builtin_choose_expr(sizeof(t) == 1 || sizeof(t) == 2 || sizeof(t) == 4 || sizeof(t) == 8, 1, \ - __builtin_choose_expr(sizeof(t) == 16, 2, \ - (void)0))) - -#define ____bpf_reg_cnt0() (0) -#define ____bpf_reg_cnt1(t, x) (____bpf_reg_cnt0() + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt2(t, x, args...) (____bpf_reg_cnt1(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt3(t, x, args...) (____bpf_reg_cnt2(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt4(t, x, args...) (____bpf_reg_cnt3(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt5(t, x, args...) (____bpf_reg_cnt4(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt6(t, x, args...) (____bpf_reg_cnt5(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt7(t, x, args...) (____bpf_reg_cnt6(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt8(t, x, args...) (____bpf_reg_cnt7(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt9(t, x, args...) (____bpf_reg_cnt8(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt10(t, x, args...) (____bpf_reg_cnt9(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt11(t, x, args...) (____bpf_reg_cnt10(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt12(t, x, args...) (____bpf_reg_cnt11(args) + BPF_REG_CNT(t)) -#define ____bpf_reg_cnt(args...) ___bpf_apply(____bpf_reg_cnt, ____bpf_narg(args))(args) - -#define ____bpf_union_arg(t, x, n) \ - __builtin_choose_expr(sizeof(t) == 1, ({ union { struct { __u8 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]}}; ___tmp.x; }), \ - __builtin_choose_expr(sizeof(t) == 2, ({ union { struct { __u16 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]} }; ___tmp.x; }), \ - __builtin_choose_expr(sizeof(t) == 4, ({ union { struct { __u32 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]} }; ___tmp.x; }), \ - __builtin_choose_expr(sizeof(t) == 8, ({ union { struct { __u64 x; } ___z; t x; } ___tmp = {.___z = {ctx[n]} }; ___tmp.x; }), \ - __builtin_choose_expr(sizeof(t) == 16, ({ union { struct { __u64 x, y; } ___z; t x; } ___tmp = {.___z = {ctx[n], ctx[n + 1]} }; ___tmp.x; }), \ +#define ___bpf_treg_cnt(t) \ + __builtin_choose_expr(sizeof(t) == 1, 1, \ + __builtin_choose_expr(sizeof(t) == 2, 1, \ + __builtin_choose_expr(sizeof(t) == 4, 1, \ + __builtin_choose_expr(sizeof(t) == 8, 1, \ + __builtin_choose_expr(sizeof(t) == 16, 2, \ (void)0))))) -#define ____bpf_ctx_arg0(n, args...) -#define ____bpf_ctx_arg1(n, t, x) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt1(t, x)) -#define ____bpf_ctx_arg2(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt2(t, x, args)) ____bpf_ctx_arg1(n, args) -#define ____bpf_ctx_arg3(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt3(t, x, args)) ____bpf_ctx_arg2(n, args) -#define ____bpf_ctx_arg4(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt4(t, x, args)) ____bpf_ctx_arg3(n, args) -#define ____bpf_ctx_arg5(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt5(t, x, args)) ____bpf_ctx_arg4(n, args) -#define ____bpf_ctx_arg6(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt6(t, x, args)) ____bpf_ctx_arg5(n, args) -#define ____bpf_ctx_arg7(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt7(t, x, args)) ____bpf_ctx_arg6(n, args) -#define ____bpf_ctx_arg8(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt8(t, x, args)) ____bpf_ctx_arg7(n, args) -#define ____bpf_ctx_arg9(n, t, x, args...) 
, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt9(t, x, args)) ____bpf_ctx_arg8(n, args) -#define ____bpf_ctx_arg10(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt10(t, x, args)) ____bpf_ctx_arg9(n, args) -#define ____bpf_ctx_arg11(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt11(t, x, args)) ____bpf_ctx_arg10(n, args) -#define ____bpf_ctx_arg12(n, t, x, args...) , ____bpf_union_arg(t, x, n - ____bpf_reg_cnt12(t, x, args)) ____bpf_ctx_arg11(n, args) -#define ____bpf_ctx_arg(n, args...) ___bpf_apply(____bpf_ctx_arg, ____bpf_narg(args))(n, args) +#define ___bpf_reg_cnt0() (0) +#define ___bpf_reg_cnt1(t, x) (___bpf_reg_cnt0() + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt2(t, x, args...) (___bpf_reg_cnt1(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt3(t, x, args...) (___bpf_reg_cnt2(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt4(t, x, args...) (___bpf_reg_cnt3(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt5(t, x, args...) (___bpf_reg_cnt4(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt6(t, x, args...) (___bpf_reg_cnt5(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt7(t, x, args...) (___bpf_reg_cnt6(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt8(t, x, args...) (___bpf_reg_cnt7(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt9(t, x, args...) (___bpf_reg_cnt8(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt10(t, x, args...) (___bpf_reg_cnt9(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt11(t, x, args...) (___bpf_reg_cnt10(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt12(t, x, args...) (___bpf_reg_cnt11(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt(args...) ___bpf_apply(___bpf_reg_cnt, ___bpf_narg2(args))(args) -#define ____bpf_ctx_decl0() -#define ____bpf_ctx_decl1(t, x) , t x -#define ____bpf_ctx_decl2(t, x, args...) , t x ____bpf_ctx_decl1(args) -#define ____bpf_ctx_decl3(t, x, args...) , t x ____bpf_ctx_decl2(args) -#define ____bpf_ctx_decl4(t, x, args...) , t x ____bpf_ctx_decl3(args) -#define ____bpf_ctx_decl5(t, x, args...) , t x ____bpf_ctx_decl4(args) -#define ____bpf_ctx_decl6(t, x, args...) , t x ____bpf_ctx_decl5(args) -#define ____bpf_ctx_decl7(t, x, args...) , t x ____bpf_ctx_decl6(args) -#define ____bpf_ctx_decl8(t, x, args...) , t x ____bpf_ctx_decl7(args) -#define ____bpf_ctx_decl9(t, x, args...) , t x ____bpf_ctx_decl8(args) -#define ____bpf_ctx_decl10(t, x, args...) , t x ____bpf_ctx_decl9(args) -#define ____bpf_ctx_decl11(t, x, args...) , t x ____bpf_ctx_decl10(args) -#define ____bpf_ctx_decl12(t, x, args...) , t x ____bpf_ctx_decl11(args) -#define ____bpf_ctx_decl(args...) ___bpf_apply(____bpf_ctx_decl, ____bpf_narg(args))(args) +#define ___bpf_union_arg(t, x, n) \ + __builtin_choose_expr(sizeof(t) == 1, ({ union { __u8 z[1]; t x; } ___t = { .z = {ctx[n]}}; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 2, ({ union { __u16 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 4, ({ union { __u32 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 8, ({ union { __u64 z[1]; t x; } ___t = {.z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 16, ({ union { __u64 z[2]; t x; } ___t = {.z = {ctx[n], ctx[n + 1]} }; ___t.x; }), \ + (void)0))))) + +#define ___bpf_ctx_arg0(n, args...) +#define ___bpf_ctx_arg1(n, t, x) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt1(t, x)) +#define ___bpf_ctx_arg2(n, t, x, args...) 
, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt2(t, x, args)) ___bpf_ctx_arg1(n, args) +#define ___bpf_ctx_arg3(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt3(t, x, args)) ___bpf_ctx_arg2(n, args) +#define ___bpf_ctx_arg4(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt4(t, x, args)) ___bpf_ctx_arg3(n, args) +#define ___bpf_ctx_arg5(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt5(t, x, args)) ___bpf_ctx_arg4(n, args) +#define ___bpf_ctx_arg6(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt6(t, x, args)) ___bpf_ctx_arg5(n, args) +#define ___bpf_ctx_arg7(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt7(t, x, args)) ___bpf_ctx_arg6(n, args) +#define ___bpf_ctx_arg8(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt8(t, x, args)) ___bpf_ctx_arg7(n, args) +#define ___bpf_ctx_arg9(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt9(t, x, args)) ___bpf_ctx_arg8(n, args) +#define ___bpf_ctx_arg10(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt10(t, x, args)) ___bpf_ctx_arg9(n, args) +#define ___bpf_ctx_arg11(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt11(t, x, args)) ___bpf_ctx_arg10(n, args) +#define ___bpf_ctx_arg12(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt12(t, x, args)) ___bpf_ctx_arg11(n, args) +#define ___bpf_ctx_arg(args...) ___bpf_apply(___bpf_ctx_arg, ___bpf_narg2(args))(___bpf_reg_cnt(args), args) + +#define ___bpf_ctx_decl0() +#define ___bpf_ctx_decl1(t, x) , t x +#define ___bpf_ctx_decl2(t, x, args...) , t x ___bpf_ctx_decl1(args) +#define ___bpf_ctx_decl3(t, x, args...) , t x ___bpf_ctx_decl2(args) +#define ___bpf_ctx_decl4(t, x, args...) , t x ___bpf_ctx_decl3(args) +#define ___bpf_ctx_decl5(t, x, args...) , t x ___bpf_ctx_decl4(args) +#define ___bpf_ctx_decl6(t, x, args...) , t x ___bpf_ctx_decl5(args) +#define ___bpf_ctx_decl7(t, x, args...) , t x ___bpf_ctx_decl6(args) +#define ___bpf_ctx_decl8(t, x, args...) , t x ___bpf_ctx_decl7(args) +#define ___bpf_ctx_decl9(t, x, args...) , t x ___bpf_ctx_decl8(args) +#define ___bpf_ctx_decl10(t, x, args...) , t x ___bpf_ctx_decl9(args) +#define ___bpf_ctx_decl11(t, x, args...) , t x ___bpf_ctx_decl10(args) +#define ___bpf_ctx_decl12(t, x, args...) , t x ___bpf_ctx_decl11(args) +#define ___bpf_ctx_decl(args...) ___bpf_apply(___bpf_ctx_decl, ___bpf_narg2(args))(args) /* - * BPF_PROG2 can handle struct arguments. + * BPF_PROG2 is an enhanced version of BPF_PROG in order to handle struct + * arguments. Since each struct argument might take one or two u64 values + * in the trampoline stack, argument type size is needed to place proper number + * of u64 values for each argument. Therefore, BPF_PROG2 has different + * syntax from BPF_PROG. For example, for the following BPF_PROG syntax: + * + * int BPF_PROG(test2, int a, int b) { ... } + * + * the corresponding BPF_PROG2 syntax is: + * + * int BPF_PROG2(test2, int, a, int, b) { ... } + * + * where type and the corresponding argument name are separated by comma. + * + * Use BPF_PROG2 macro if one of the arguments might be a struct/union larger + * than 8 bytes: + * + * int BPF_PROG2(test_struct_arg, struct bpf_testmod_struct_arg_1, a, int, b, + * int, c, int, d, struct bpf_testmod_struct_arg_2, e, int, ret) + * { + * // access a, b, c, d, e, and ret directly + * ... + * } */ #define BPF_PROG2(name, args...) 
\ name(unsigned long long *ctx); \ static __always_inline typeof(name(0)) \ -____##name(unsigned long long *ctx ____bpf_ctx_decl(args)); \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)); \ typeof(name(0)) name(unsigned long long *ctx) \ { \ - return ____##name(ctx ____bpf_ctx_arg(____bpf_reg_cnt(args), args)); \ + return ____##name(ctx ___bpf_ctx_arg(args)); \ } \ static __always_inline typeof(name(0)) \ -____##name(unsigned long long *ctx ____bpf_ctx_decl(args)) +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)) struct pt_regs; From 272d1f4cfa3c75d4828b62ef33ccb207da3b7350 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Thu, 8 Sep 2022 15:01:46 +0300 Subject: [PATCH 059/143] selftests: bpf: test_kmod.sh: Pass parameters to the module It's possible to specify particular tests for test_bpf.ko with module parameters. Make it possible to pass the module parameters, example: test_kmod.sh test_range=1,3 Since magnitude tests take long time it can be reasonable to skip them. Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220908120146.381218-1-ykaliuta@redhat.com --- tools/testing/selftests/bpf/test_kmod.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh index 4f6444bcd53f..d4a4279c0181 100755 --- a/tools/testing/selftests/bpf/test_kmod.sh +++ b/tools/testing/selftests/bpf/test_kmod.sh @@ -1,6 +1,11 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 +# Usage: +# ./test_kmod.sh [module_param]... +# Ex.: ./test_kmod.sh test_range=1,3 +# All the parameters are passed to the kernel module. + # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 @@ -26,15 +31,15 @@ test_run() echo "[ JIT enabled:$1 hardened:$2 ]" dmesg -C if [ -f ${OUTPUT}/lib/test_bpf.ko ]; then - insmod ${OUTPUT}/lib/test_bpf.ko 2> /dev/null + insmod ${OUTPUT}/lib/test_bpf.ko "$@" 2> /dev/null if [ $? -ne 0 ]; then rc=1 fi else # Use modprobe dry run to check for missing test_bpf module - if ! /sbin/modprobe -q -n test_bpf; then + if ! /sbin/modprobe -q -n test_bpf "$@"; then echo "test_bpf: [SKIP]" - elif /sbin/modprobe -q test_bpf; then + elif /sbin/modprobe -q test_bpf "$@"; then echo "test_bpf: ok" else echo "test_bpf: [FAIL]" From 01f2e36c959c813a532ae836db49b2ac9de46de4 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Tue, 13 Sep 2022 00:43:00 +0800 Subject: [PATCH 060/143] libbpf: Support raw BTF placed in the default search path Currently, the default vmlinux files at '/boot/vmlinux-*', '/lib/modules/*/vmlinux-*' etc. are parsed with 'btf__parse_elf()' to extract BTF. It is possible that these files are actually raw BTF files similar to /sys/kernel/btf/vmlinux. So parse these files with 'btf__parse' which tries both raw format and ELF format. This might be useful in some scenarios where users put their custom BTF into known locations and don't want to specify btf_custom_path option. 
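For illustration, consumers do not need to change at all; a minimal
sketch (not part of this patch, assuming a raw BTF blob has been copied
to one of the locations above, e.g. /boot/vmlinux-$(uname -r)):

	#include <stdio.h>
	#include <bpf/btf.h>
	#include <bpf/libbpf.h>

	int main(void)
	{
		/* Tries /sys/kernel/btf/vmlinux first, then the on-disk
		 * fallbacks; every candidate now goes through btf__parse(),
		 * which accepts both raw BTF and ELF containers.
		 */
		struct btf *btf = btf__load_vmlinux_btf();
		long err = libbpf_get_error(btf);

		if (err) {
			fprintf(stderr, "no vmlinux BTF found: %ld\n", err);
			return 1;
		}
		printf("loaded vmlinux BTF, %u types\n", btf__type_cnt(btf));
		btf__free(btf);
		return 0;
	}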
Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/3f59fb5a345d2e4f10e16fe9e35fbc4c03ecaa3e.1662999860.git.chentao.kernel@linux.alibaba.com --- tools/lib/bpf/btf.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 361131518d63..b4d9a96c3c1b 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4642,20 +4642,17 @@ static int btf_dedup_remap_types(struct btf_dedup *d) */ struct btf *btf__load_vmlinux_btf(void) { - struct { - const char *path_fmt; - bool raw_btf; - } locations[] = { + const char *locations[] = { /* try canonical vmlinux BTF through sysfs first */ - { "/sys/kernel/btf/vmlinux", true /* raw BTF */ }, - /* fall back to trying to find vmlinux ELF on disk otherwise */ - { "/boot/vmlinux-%1$s" }, - { "/lib/modules/%1$s/vmlinux-%1$s" }, - { "/lib/modules/%1$s/build/vmlinux" }, - { "/usr/lib/modules/%1$s/kernel/vmlinux" }, - { "/usr/lib/debug/boot/vmlinux-%1$s" }, - { "/usr/lib/debug/boot/vmlinux-%1$s.debug" }, - { "/usr/lib/debug/lib/modules/%1$s/vmlinux" }, + "/sys/kernel/btf/vmlinux", + /* fall back to trying to find vmlinux on disk otherwise */ + "/boot/vmlinux-%1$s", + "/lib/modules/%1$s/vmlinux-%1$s", + "/lib/modules/%1$s/build/vmlinux", + "/usr/lib/modules/%1$s/kernel/vmlinux", + "/usr/lib/debug/boot/vmlinux-%1$s", + "/usr/lib/debug/boot/vmlinux-%1$s.debug", + "/usr/lib/debug/lib/modules/%1$s/vmlinux", }; char path[PATH_MAX + 1]; struct utsname buf; @@ -4665,15 +4662,12 @@ struct btf *btf__load_vmlinux_btf(void) uname(&buf); for (i = 0; i < ARRAY_SIZE(locations); i++) { - snprintf(path, PATH_MAX, locations[i].path_fmt, buf.release); + snprintf(path, PATH_MAX, locations[i], buf.release); if (access(path, R_OK)) continue; - if (locations[i].raw_btf) - btf = btf__parse_raw(path); - else - btf = btf__parse_elf(path, NULL); + btf = btf__parse(path, NULL); err = libbpf_get_error(btf); pr_debug("loading kernel BTF '%s': %d\n", path, err); if (err) From d15bf1501c7533826a616478002c601fcc7671f3 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 20 Sep 2022 09:59:39 +0200 Subject: [PATCH 061/143] bpf: Allow kfuncs to be used in LSM programs In preparation for the addition of new kfuncs, allow kfuncs defined in the tracing subsystem to be used in LSM programs by mapping the LSM program type to the TRACING hook. Signed-off-by: KP Singh Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-2-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6ccd4f4d731e..dbcf020c4124 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7376,6 +7376,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_STRUCT_OPS: return BTF_KFUNC_HOOK_STRUCT_OPS; case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: return BTF_KFUNC_HOOK_TRACING; case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; From 00f146413ccb6c84308e559281449755c83f54c5 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:40 +0200 Subject: [PATCH 062/143] btf: Export bpf_dynptr definition eBPF dynamic pointers is a new feature recently added to upstream. It binds together a pointer to a memory area and its size. The internal kernel structure bpf_dynptr_kern is not accessible by eBPF programs in user space. 
They instead see bpf_dynptr, which is then translated to the internal kernel structure by the eBPF verifier. The problem is that it is not possible to include at the same time the uapi include linux/bpf.h and the vmlinux BTF vmlinux.h, as they both contain the definition of some structures/enums. The compiler complains saying that the structures/enums are redefined. As bpf_dynptr is defined in the uapi include linux/bpf.h, this makes it impossible to include vmlinux.h. However, in some cases, e.g. when using kfuncs, vmlinux.h has to be included. The only option until now was to include vmlinux.h and add the definition of bpf_dynptr directly in the eBPF program source code from linux/bpf.h. Solve the problem by using the same approach as for bpf_timer (which also follows the same scheme with the _kern suffix for the internal kernel structure). Add the following line in one of the dynamic pointer helpers, bpf_dynptr_from_mem(): BTF_TYPE_EMIT(struct bpf_dynptr); Cc: stable@vger.kernel.org Cc: Joanne Koong Fixes: 97e03f521050c ("bpf: Add verifier support for dynptrs") Signed-off-by: Roberto Sassu Acked-by: Yonghong Song Tested-by: KP Singh Link: https://lore.kernel.org/r/20220920075951.929132-3-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cb5564c77482..6d69e30f42d8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1446,6 +1446,8 @@ BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_ { int err; + BTF_TYPE_EMIT(struct bpf_dynptr); + err = bpf_dynptr_check_size(size); if (err) goto error; From e9e315b4a5de32d0482b92f482517095d5d844e4 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:41 +0200 Subject: [PATCH 063/143] bpf: Move dynptr type check to is_dynptr_type_expected() Move dynptr type check to is_dynptr_type_expected() from is_dynptr_reg_valid_init(), so that callers can better determine the cause of a negative result (dynamic pointer not valid/initialized, dynamic pointer of the wrong type). It will be useful for example for BTF, to restrict which dynamic pointer types can be passed to kfuncs, as initially only the local type will be supported. Also, splitting makes the code more readable, since checking the dynamic pointer type is not necessarily related to validity and initialization. Split the validity/initialization and dynamic pointer type check also in the verifier, and adjust the expected error message in the test (a test for an unexpected dynptr type passed to a helper cannot be added due to missing suitable helpers, but this case has been tested manually). 
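Condensed to C, the caller-side pattern in the verifier after the split
becomes (a sketch of the hunk below):

	if (!is_dynptr_reg_valid_init(env, reg)) {
		/* dynptr is not valid/initialized */
		verbose(env, "Expected an initialized dynptr as arg #%d\n",
			arg + 1);
		return -EINVAL;
	}
	if (!is_dynptr_type_expected(env, reg, arg_type)) {
		/* valid dynptr, but of the wrong type (local, ringbuf, ...) */
		verbose(env, "Expected a dynptr of type %s as arg #%d\n",
			err_extra, arg + 1);
		return -EINVAL;
	}

so each failure mode gets its own, more precise error message.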
Cc: Joanne Koong Cc: Kumar Kartikeya Dwivedi Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-4-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 35 ++++++++++++++----- .../testing/selftests/bpf/prog_tests/dynptr.c | 2 +- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c76fa45a5906..c08dde19eb67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -782,8 +782,8 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi = get_spi(reg->off); @@ -799,11 +799,24 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re return false; } + return true; +} + +static bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) +{ + struct bpf_func_state *state = func(env, reg); + enum bpf_dynptr_type dynptr_type; + int spi = get_spi(reg->off); + /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; - return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type); + dynptr_type = arg_to_dynptr_type(arg_type); + + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; } /* The reg state of a pointer or a bounded scalar was saved when @@ -6095,21 +6108,27 @@ skip_type_check: } meta->uninit_dynptr_regno = regno; - } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) { + } else if (!is_dynptr_reg_valid_init(env, reg)) { + verbose(env, + "Expected an initialized dynptr as arg #%d\n", + arg + 1); + return -EINVAL; + } else if (!is_dynptr_type_expected(env, reg, arg_type)) { const char *err_extra = ""; switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { case DYNPTR_TYPE_LOCAL: - err_extra = "local "; + err_extra = "local"; break; case DYNPTR_TYPE_RINGBUF: - err_extra = "ringbuf "; + err_extra = "ringbuf"; break; default: + err_extra = ""; break; } - - verbose(env, "Expected an initialized %sdynptr as arg #%d\n", + verbose(env, + "Expected a dynptr of type %s as arg #%d\n", err_extra, arg + 1); return -EINVAL; } diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index bcf80b9f7c27..8fc4e6c02bfd 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -30,7 +30,7 @@ static struct { {"invalid_helper2", "Expected an initialized dynptr as arg #3"}, {"invalid_write1", "Expected an initialized dynptr as arg #1"}, {"invalid_write2", "Expected an initialized dynptr as arg #3"}, - {"invalid_write3", "Expected an initialized ringbuf dynptr as arg #1"}, + {"invalid_write3", "Expected an initialized dynptr as arg #1"}, {"invalid_write4", "arg 1 is an unacquired reference"}, {"invalid_read1", "invalid read from stack"}, {"invalid_read2", "cannot pass in dynptr at an offset"}, From b8d31762a0ae6861e1115302ee338560d853e317 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:42 +0200 Subject: [PATCH 064/143] btf: Allow dynamic pointer parameters in kfuncs Allow dynamic pointers (struct bpf_dynptr_kern *) to be specified as parameters 
in kfuncs. Also, ensure that dynamic pointers passed as argument are valid and initialized, are a pointer to the stack, and of the type local. More dynamic pointer types can be supported in the future. To properly detect whether a parameter is of the desired type, introduce the stringify_struct() macro to compare the returned structure name with the desired name. In addition, protect against structure renames, by halting the build with BUILD_BUG_ON(), so that developers have to revisit the code. To check if a dynamic pointer passed to the kfunc is valid and initialized, and if its type is local, export the existing functions is_dynptr_reg_valid_init() and is_dynptr_type_expected(). Cc: Joanne Koong Cc: Kumar Kartikeya Dwivedi Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-5-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ include/linux/btf.h | 9 +++++++++ kernel/bpf/btf.c | 33 +++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 10 +++++----- 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e197f8fb27e2..9e1e6965f407 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -593,6 +593,11 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); +bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg); +bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/include/linux/btf.h b/include/linux/btf.h index 1fcc833a8690..f9aababc5d78 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -52,6 +52,15 @@ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +/* + * Return the name of the passed struct, if exists, or halt the build if for + * example the structure gets renamed. In this way, developers have to revisit + * the code using that structure name, and update it accordingly. + */ +#define stringify_struct(x) \ + ({ BUILD_BUG_ON(sizeof(struct x) < 0); \ + __stringify(x); }) + struct btf; struct btf_member; struct btf_type; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbcf020c4124..13faede0f2b4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6449,15 +6449,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (is_kfunc) { bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); + bool arg_dynptr = btf_type_is_struct(ref_t) && + !strcmp(ref_tname, + stringify_struct(bpf_dynptr_kern)); /* Permit pointer to mem, but only when argument * type is pointer to scalar, or struct composed * (recursively) of scalars. * When arg_mem_size is true, the pointer can be * void *. + * Also permit initialized local dynamic pointers. */ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && + !arg_dynptr && (arg_mem_size ? 
!btf_type_is_void(ref_t) : 1)) { bpf_log(log, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", @@ -6465,6 +6470,34 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + if (arg_dynptr) { + if (reg->type != PTR_TO_STACK) { + bpf_log(log, "arg#%d pointer type %s %s not to stack\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (!is_dynptr_reg_valid_init(env, reg)) { + bpf_log(log, + "arg#%d pointer type %s %s must be valid and initialized\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (!is_dynptr_type_expected(env, reg, + ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { + bpf_log(log, + "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + continue; + } + /* Check for mem, len pair */ if (arg_mem_size) { if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c08dde19eb67..6f6d2d511c06 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -782,8 +782,8 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi = get_spi(reg->off); @@ -802,9 +802,9 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, return true; } -static bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; From 51df4865718540f51bb5d3e552c50dc88e1333d6 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:43 +0200 Subject: [PATCH 065/143] bpf: Export bpf_dynptr_get_size() Export bpf_dynptr_get_size(), so that kernel code dealing with eBPF dynamic pointers can obtain the real size of data carried by this data structure. 
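A sketch of the intended kernel-side usage (consume_data() is a
hypothetical callee; the PKCS#7 verification kfunc added later in this
series follows the same pattern):

	static int consume_dynptr(struct bpf_dynptr_kern *ptr)
	{
		/* ptr->size also carries the dynptr type and read-only
		 * bits in its upper bits, so callers must go through the
		 * accessor instead of reading the field directly.
		 */
		u32 size = bpf_dynptr_get_size(ptr);

		return consume_data(ptr->data, size);
	}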
Signed-off-by: Roberto Sassu Reviewed-by: Joanne Koong Acked-by: KP Singh Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-6-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/helpers.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33e543b86e1a..6535fb1e21b9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2654,6 +2654,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); +u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6d69e30f42d8..b069517a3da0 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1408,7 +1408,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } From 90fd8f26edd47942203639bf3a5dde8fa1931a0e Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:44 +0200 Subject: [PATCH 066/143] KEYS: Move KEY_LOOKUP_ to include/linux/key.h and define KEY_LOOKUP_ALL In preparation for the patch that introduces the bpf_lookup_user_key() eBPF kfunc, move KEY_LOOKUP_ definitions to include/linux/key.h, to be able to validate the kfunc parameters. Add them to enum key_lookup_flag, so that all the current ones and the ones defined in the future are automatically exported through BTF and available to eBPF programs. Also, add KEY_LOOKUP_ALL to the enum, with the logical OR of currently defined flags as value, to facilitate checking whether a variable contains only those flags. 
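The check that KEY_LOOKUP_ALL enables is then a one-liner; this is the
exact pattern used by bpf_lookup_user_key() later in this series:

	/* Reject any flag outside the currently defined lookup flags.
	 * KEY_LOOKUP_ALL is the OR of all flags in enum key_lookup_flag,
	 * so it must be kept in sync when new flags are added.
	 */
	if (flags & ~KEY_LOOKUP_ALL)
		return NULL;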
Signed-off-by: Roberto Sassu Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20220920075951.929132-7-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/key.h | 6 ++++++ security/keys/internal.h | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/key.h b/include/linux/key.h index 7febc4881363..d27477faf00d 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -88,6 +88,12 @@ enum key_need_perm { KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ }; +enum key_lookup_flag { + KEY_LOOKUP_CREATE = 0x01, + KEY_LOOKUP_PARTIAL = 0x02, + KEY_LOOKUP_ALL = (KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL), +}; + struct seq_file; struct user_struct; struct signal_struct; diff --git a/security/keys/internal.h b/security/keys/internal.h index 9b9cf3b6fcbb..3c1e7122076b 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -165,8 +165,6 @@ extern struct key *request_key_and_link(struct key_type *type, extern bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data); -#define KEY_LOOKUP_CREATE 0x01 -#define KEY_LOOKUP_PARTIAL 0x02 extern long join_session_keyring(const char *name); extern void key_change_session_keyring(struct callback_head *twork); From f3cf4134c5c6c47b9b5c7aa3cb2d67e107887a7b Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:45 +0200 Subject: [PATCH 067/143] bpf: Add bpf_lookup_*_key() and bpf_key_put() kfuncs Add the bpf_lookup_user_key(), bpf_lookup_system_key() and bpf_key_put() kfuncs, to respectively search a key with a given key handle serial number and flags, obtain a key from a pre-determined ID defined in include/linux/verification.h, and cleanup. Introduce system_keyring_id_check() to validate the keyring ID parameter of bpf_lookup_system_key(). Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Acked-by: Song Liu Link: https://lore.kernel.org/r/20220920075951.929132-8-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 +++ include/linux/verification.h | 8 +++ kernel/trace/bpf_trace.c | 135 +++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6535fb1e21b9..a1435b019aca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2664,4 +2664,12 @@ static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ +struct key; + +#ifdef CONFIG_KEYS +struct bpf_key { + struct key *key; + bool has_ref; +}; +#endif /* CONFIG_KEYS */ #endif /* _LINUX_BPF_H */ diff --git a/include/linux/verification.h b/include/linux/verification.h index a655923335ae..f34e50ebcf60 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -17,6 +17,14 @@ #define VERIFY_USE_SECONDARY_KEYRING ((struct key *)1UL) #define VERIFY_USE_PLATFORM_KEYRING ((struct key *)2UL) +static inline int system_keyring_id_check(u64 id) +{ + if (id > (unsigned long)VERIFY_USE_PLATFORM_KEYRING) + return -EINVAL; + + return 0; +} + /* * The use to which an asymmetric key is being put. 
*/ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 68e5cdd24cef..ab183dbaa8d1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include @@ -1181,6 +1183,139 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +#ifdef CONFIG_KEYS +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "kfuncs which will be used in BPF programs"); + +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. + * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. + */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kerned image and, possibly, the initramfs signature). 
+ * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +__diag_pop(); + +BTF_SET8_START(key_sig_kfunc_set) +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +BTF_SET8_END(key_sig_kfunc_set) + +static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { + .owner = THIS_MODULE, + .set = &key_sig_kfunc_set, +}; + +static int __init bpf_key_sig_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_key_sig_kfunc_set); +} + +late_initcall(bpf_key_sig_kfuncs_init); +#endif /* CONFIG_KEYS */ + static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { From 865b0566d8f1a0c3937e5eb4bd6ba4ef03e7e98c Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:46 +0200 Subject: [PATCH 068/143] bpf: Add bpf_verify_pkcs7_signature() kfunc Add the bpf_verify_pkcs7_signature() kfunc, to give eBPF security modules the ability to check the validity of a signature against supplied data, by using user-provided or system-provided keys as trust anchor. The new kfunc makes it possible to enforce mandatory policies, as eBPF programs might be allowed to make security decisions only based on data sources the system administrator approves. The caller should provide the data to be verified and the signature as eBPF dynamic pointers (to minimize the number of parameters) and a bpf_key structure containing a reference to the keyring with keys trusted for signature verification, obtained from bpf_lookup_user_key() or bpf_lookup_system_key(). For bpf_key structures obtained from the former lookup function, bpf_verify_pkcs7_signature() completes the permission check deferred by that function by calling key_validate(). key_task_permission() is already called by the PKCS#7 code. 
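A minimal BPF-side sketch of the intended call pattern (hypothetical
buffer names; data/sig are assumed to live in map memory so that
bpf_dynptr_from_mem() can bind them as local dynptrs, 1 is the value of
VERIFY_USE_SECONDARY_KEYRING from include/linux/verification.h, and
error checks are elided):

	extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym;
	extern void bpf_key_put(struct bpf_key *key) __ksym;
	extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr,
					      struct bpf_dynptr *sig_ptr,
					      struct bpf_key *trusted_keyring) __ksym;

	SEC("lsm.s/bpf")
	int BPF_PROG(check_attr, int cmd, union bpf_attr *attr, unsigned int size)
	{
		struct bpf_dynptr data_ptr, sig_ptr;
		struct bpf_key *trusted_keyring;
		int ret;

		bpf_dynptr_from_mem(data, data_len, 0, &data_ptr);
		bpf_dynptr_from_mem(sig, sig_len, 0, &sig_ptr);

		trusted_keyring = bpf_lookup_system_key(1);
		if (!trusted_keyring)
			return -ENOENT;

		ret = bpf_verify_pkcs7_signature(&data_ptr, &sig_ptr,
						 trusted_keyring);

		bpf_key_put(trusted_keyring);
		return ret;
	}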
Signed-off-by: Roberto Sassu Acked-by: KP Singh Acked-by: Song Liu Link: https://lore.kernel.org/r/20220920075951.929132-9-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 45 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ab183dbaa8d1..9df53c40cffd 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1294,12 +1294,57 @@ void bpf_key_put(struct bpf_key *bkey) kfree(bkey); } +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_ptr: data to verify + * @sig_ptr: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error. + */ +int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, + struct bpf_dynptr_kern *sig_ptr, + struct bpf_key *trusted_keyring) +{ + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). + */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + return verify_pkcs7_signature(data_ptr->data, + bpf_dynptr_get_size(data_ptr), + sig_ptr->data, + bpf_dynptr_get_size(sig_ptr), + trusted_keyring->key, + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + NULL); +} +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ + __diag_pop(); BTF_SET8_START(key_sig_kfunc_set) BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif BTF_SET8_END(key_sig_kfunc_set) static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { From 94fd7420faa0bc85341c0a9cbe5e5240ef4f123d Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:47 +0200 Subject: [PATCH 069/143] selftests/bpf: Compile kernel with everything as built-in MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the eBPF CI does not support kernel modules, change the kernel config to compile everything as built-in. 
Signed-off-by: Roberto Sassu Acked-by: Daniel Müller Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-10-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/config | 26 +++++++++++------------ tools/testing/selftests/bpf/config.x86_64 | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 3fc46f9cfb22..0fdd11e6b742 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -7,9 +7,9 @@ CONFIG_BPF_LSM=y CONFIG_BPF_STREAM_PARSER=y CONFIG_BPF_SYSCALL=y CONFIG_CGROUP_BPF=y -CONFIG_CRYPTO_HMAC=m -CONFIG_CRYPTO_SHA256=m -CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_USER_API_HASH=y CONFIG_DYNAMIC_FTRACE=y CONFIG_FPROBE=y CONFIG_FTRACE_SYSCALLS=y @@ -24,30 +24,30 @@ CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_RAW=y CONFIG_IP_NF_TARGET_SYNPROXY=y CONFIG_IPV6=y -CONFIG_IPV6_FOU=m -CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_FOU=y +CONFIG_IPV6_FOU_TUNNEL=y CONFIG_IPV6_GRE=y CONFIG_IPV6_SEG6_BPF=y -CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT=y CONFIG_IPV6_TUNNEL=y CONFIG_LIRC=y CONFIG_LWTUNNEL=y CONFIG_MPLS=y -CONFIG_MPLS_IPTUNNEL=m -CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=y +CONFIG_MPLS_ROUTING=y CONFIG_MPTCP=y CONFIG_NET_CLS_ACT=y CONFIG_NET_CLS_BPF=y -CONFIG_NET_CLS_FLOWER=m -CONFIG_NET_FOU=m +CONFIG_NET_CLS_FLOWER=y +CONFIG_NET_FOU=y CONFIG_NET_FOU_IP_TUNNELS=y CONFIG_NET_IPGRE=y CONFIG_NET_IPGRE_DEMUX=y CONFIG_NET_IPIP=y -CONFIG_NET_MPLS_GSO=m +CONFIG_NET_MPLS_GSO=y CONFIG_NET_SCH_INGRESS=y CONFIG_NET_SCHED=y -CONFIG_NETDEVSIM=m +CONFIG_NETDEVSIM=y CONFIG_NETFILTER=y CONFIG_NETFILTER_SYNPROXY=y CONFIG_NETFILTER_XT_CONNMARK=y @@ -60,7 +60,7 @@ CONFIG_NF_DEFRAG_IPV6=y CONFIG_RC_CORE=y CONFIG_SECURITY=y CONFIG_SECURITYFS=y -CONFIG_TEST_BPF=m +CONFIG_TEST_BPF=y CONFIG_USERFAULTFD=y CONFIG_VXLAN=y CONFIG_XDP_SOCKETS=y diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64 index f0859a1d37ab..ce70c9509204 100644 --- a/tools/testing/selftests/bpf/config.x86_64 +++ b/tools/testing/selftests/bpf/config.x86_64 @@ -47,7 +47,7 @@ CONFIG_CPU_IDLE_GOV_LADDER=y CONFIG_CPUSETS=y CONFIG_CRC_T10DIF=y CONFIG_CRYPTO_BLAKE2B=y -CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_CRYPTO_SEQIV=y CONFIG_CRYPTO_XXHASH=y CONFIG_DCB=y From 7c036ed9e0065e852fb1886d9ea97ceb35680e3f Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:48 +0200 Subject: [PATCH 070/143] selftests/bpf: Add verifier tests for bpf_lookup_*_key() and bpf_key_put() Add verifier tests for bpf_lookup_*_key() and bpf_key_put(), to ensure that acquired key references stored in the bpf_key structure are released, that a non-NULL bpf_key pointer is passed to bpf_key_put(), and that key references are not leaked. Also, slightly modify test_verifier.c, to find the BTF ID of the attach point for the LSM program type (currently, it is done only for TRACING). 
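In C terms, the leak pattern that the new "Unreleased reference" tests
encode as raw instructions is roughly:

	struct bpf_key *bkey;

	bkey = bpf_lookup_user_key(-3 /* session keyring serial */, 0);

	/* Exiting here leaks the acquired reference: the verifier must
	 * reject the program with "Unreleased reference". Likewise,
	 * passing bkey to bpf_key_put() without a NULL check, or
	 * passing a NULL pointer outright, must be rejected.
	 */
	return 0;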
Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-11-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/config | 1 + tools/testing/selftests/bpf/test_verifier.c | 3 +- .../selftests/bpf/verifier/ref_tracking.c | 139 ++++++++++++++++++ 3 files changed, 142 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 0fdd11e6b742..add5a5a919b4 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -30,6 +30,7 @@ CONFIG_IPV6_GRE=y CONFIG_IPV6_SEG6_BPF=y CONFIG_IPV6_SIT=y CONFIG_IPV6_TUNNEL=y +CONFIG_KEYS=y CONFIG_LIRC=y CONFIG_LWTUNNEL=y CONFIG_MPLS=y diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index f9d553fbf68a..2dbcbf363c18 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1498,7 +1498,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv, opts.log_level = DEFAULT_LIBBPF_LOG_LEVEL; opts.prog_flags = pflags; - if (prog_type == BPF_PROG_TYPE_TRACING && test->kfunc) { + if ((prog_type == BPF_PROG_TYPE_TRACING || + prog_type == BPF_PROG_TYPE_LSM) && test->kfunc) { int attach_btf_id; attach_btf_id = libbpf_find_vmlinux_btf_id(test->kfunc, diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 57a83d763ec1..f18ce867271f 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -84,6 +84,145 @@ .errstr = "Unreleased reference", .result = REJECT, }, +{ + "reference tracking: acquire/release user key reference", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_LSM, + .kfunc = "bpf", + .expected_attach_type = BPF_LSM_MAC, + .flags = BPF_F_SLEEPABLE, + .fixup_kfunc_btf_id = { + { "bpf_lookup_user_key", 2 }, + { "bpf_key_put", 5 }, + }, + .result = ACCEPT, +}, +{ + "reference tracking: acquire/release system key reference", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_LSM, + .kfunc = "bpf", + .expected_attach_type = BPF_LSM_MAC, + .flags = BPF_F_SLEEPABLE, + .fixup_kfunc_btf_id = { + { "bpf_lookup_system_key", 1 }, + { "bpf_key_put", 4 }, + }, + .result = ACCEPT, +}, +{ + "reference tracking: release user key reference without check", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_LSM, + .kfunc = "bpf", + .expected_attach_type = BPF_LSM_MAC, + .flags = BPF_F_SLEEPABLE, + .errstr = "arg#0 pointer type STRUCT bpf_key must point to 
scalar, or struct with scalar", + .fixup_kfunc_btf_id = { + { "bpf_lookup_user_key", 2 }, + { "bpf_key_put", 4 }, + }, + .result = REJECT, +}, +{ + "reference tracking: release system key reference without check", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_LSM, + .kfunc = "bpf", + .expected_attach_type = BPF_LSM_MAC, + .flags = BPF_F_SLEEPABLE, + .errstr = "arg#0 pointer type STRUCT bpf_key must point to scalar, or struct with scalar", + .fixup_kfunc_btf_id = { + { "bpf_lookup_system_key", 1 }, + { "bpf_key_put", 3 }, + }, + .result = REJECT, +}, +{ + "reference tracking: release with NULL key pointer", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_LSM, + .kfunc = "bpf", + .expected_attach_type = BPF_LSM_MAC, + .flags = BPF_F_SLEEPABLE, + .errstr = "arg#0 pointer type STRUCT bpf_key must point to scalar, or struct with scalar", + .fixup_kfunc_btf_id = { + { "bpf_key_put", 1 }, + }, + .result = REJECT, +}, +{ + "reference tracking: leak potential reference to user key", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_LSM, + .kfunc = "bpf", + .expected_attach_type = BPF_LSM_MAC, + .flags = BPF_F_SLEEPABLE, + .errstr = "Unreleased reference", + .fixup_kfunc_btf_id = { + { "bpf_lookup_user_key", 2 }, + }, + .result = REJECT, +}, +{ + "reference tracking: leak potential reference to system key", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_LSM, + .kfunc = "bpf", + .expected_attach_type = BPF_LSM_MAC, + .flags = BPF_F_SLEEPABLE, + .errstr = "Unreleased reference", + .fixup_kfunc_btf_id = { + { "bpf_lookup_system_key", 1 }, + }, + .result = REJECT, +}, { "reference tracking: release reference without check", .insns = { From ecce368d6e6d76168be5d8d34b411c69ec367859 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:49 +0200 Subject: [PATCH 071/143] selftests/bpf: Add additional tests for bpf_lookup_*_key() Add a test to ensure that bpf_lookup_user_key() creates a referenced special keyring when the KEY_LOOKUP_CREATE flag is passed to this function. Ensure that the kfunc rejects invalid flags. Ensure that a keyring can be obtained from bpf_lookup_system_key() when one of the pre-determined keyring IDs is provided. The test is currently blacklisted for s390x (JIT does not support calling kernel function). 
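For comparison, the userspace analogue of the create-on-demand
behaviour being tested is roughly (a libkeyutils sketch, not part of
this patch):

	#include <errno.h>
	#include <keyutils.h>

	/* Without the create flag, looking up a not-yet-existing
	 * thread-specific keyring fails with ENOKEY; with create set,
	 * the keyring is created on demand - the same semantics that
	 * KEY_LOOKUP_CREATE selects in bpf_lookup_user_key().
	 */
	key_serial_t id = keyctl_get_keyring_ID(KEY_SPEC_THREAD_KEYRING, 0);

	if (id < 0 && errno == ENOKEY)
		id = keyctl_get_keyring_ID(KEY_SPEC_THREAD_KEYRING, 1);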
Signed-off-by: Roberto Sassu Link: https://lore.kernel.org/r/20220920075951.929132-12-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + .../selftests/bpf/prog_tests/lookup_key.c | 112 ++++++++++++++++++ .../selftests/bpf/progs/test_lookup_key.c | 46 +++++++ 3 files changed, 159 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/lookup_key.c create mode 100644 tools/testing/selftests/bpf/progs/test_lookup_key.c diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 981c2be922f4..a6ac5dce7856 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -72,3 +72,4 @@ cgroup_hierarchical_stats # JIT does not support calling kernel f htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) tracing_struct # failed to auto-attach: -524 (trampoline) user_ringbuf # failed to find kernel BTF type ID of '__s390x_sys_prctl': -3 (?) +lookup_key # JIT does not support calling kernel function (kfunc) diff --git a/tools/testing/selftests/bpf/prog_tests/lookup_key.c b/tools/testing/selftests/bpf/prog_tests/lookup_key.c new file mode 100644 index 000000000000..68025e88f352 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lookup_key.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH + * + * Author: Roberto Sassu + */ + +#include +#include + +#include "test_lookup_key.skel.h" + +#define KEY_LOOKUP_CREATE 0x01 +#define KEY_LOOKUP_PARTIAL 0x02 + +static bool kfunc_not_supported; + +static int libbpf_print_cb(enum libbpf_print_level level, const char *fmt, + va_list args) +{ + char *func; + + if (strcmp(fmt, "libbpf: extern (func ksym) '%s': not found in kernel or module BTFs\n")) + return 0; + + func = va_arg(args, char *); + + if (strcmp(func, "bpf_lookup_user_key") && strcmp(func, "bpf_key_put") && + strcmp(func, "bpf_lookup_system_key")) + return 0; + + kfunc_not_supported = true; + return 0; +} + +void test_lookup_key(void) +{ + libbpf_print_fn_t old_print_cb; + struct test_lookup_key *skel; + __u32 next_id; + int ret; + + skel = test_lookup_key__open(); + if (!ASSERT_OK_PTR(skel, "test_lookup_key__open")) + return; + + old_print_cb = libbpf_set_print(libbpf_print_cb); + ret = test_lookup_key__load(skel); + libbpf_set_print(old_print_cb); + + if (ret < 0 && kfunc_not_supported) { + printf("%s:SKIP:bpf_lookup_*_key(), bpf_key_put() kfuncs not supported\n", + __func__); + test__skip(); + goto close_prog; + } + + if (!ASSERT_OK(ret, "test_lookup_key__load")) + goto close_prog; + + ret = test_lookup_key__attach(skel); + if (!ASSERT_OK(ret, "test_lookup_key__attach")) + goto close_prog; + + skel->bss->monitored_pid = getpid(); + skel->bss->key_serial = KEY_SPEC_THREAD_KEYRING; + + /* The thread-specific keyring does not exist, this test fails. */ + skel->bss->flags = 0; + + ret = bpf_prog_get_next_id(0, &next_id); + if (!ASSERT_LT(ret, 0, "bpf_prog_get_next_id")) + goto close_prog; + + /* Force creation of the thread-specific keyring, this test succeeds. */ + skel->bss->flags = KEY_LOOKUP_CREATE; + + ret = bpf_prog_get_next_id(0, &next_id); + if (!ASSERT_OK(ret, "bpf_prog_get_next_id")) + goto close_prog; + + /* Pass both lookup flags for parameter validation. 
*/ + skel->bss->flags = KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL; + + ret = bpf_prog_get_next_id(0, &next_id); + if (!ASSERT_OK(ret, "bpf_prog_get_next_id")) + goto close_prog; + + /* Pass invalid flags. */ + skel->bss->flags = UINT64_MAX; + + ret = bpf_prog_get_next_id(0, &next_id); + if (!ASSERT_LT(ret, 0, "bpf_prog_get_next_id")) + goto close_prog; + + skel->bss->key_serial = 0; + skel->bss->key_id = 1; + + ret = bpf_prog_get_next_id(0, &next_id); + if (!ASSERT_OK(ret, "bpf_prog_get_next_id")) + goto close_prog; + + skel->bss->key_id = UINT32_MAX; + + ret = bpf_prog_get_next_id(0, &next_id); + ASSERT_LT(ret, 0, "bpf_prog_get_next_id"); + +close_prog: + skel->bss->monitored_pid = 0; + test_lookup_key__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_lookup_key.c b/tools/testing/selftests/bpf/progs/test_lookup_key.c new file mode 100644 index 000000000000..c73776990ae3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_lookup_key.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH + * + * Author: Roberto Sassu + */ + +#include "vmlinux.h" +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u32 monitored_pid; +__u32 key_serial; +__u32 key_id; +__u64 flags; + +extern struct bpf_key *bpf_lookup_user_key(__u32 serial, __u64 flags) __ksym; +extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym; +extern void bpf_key_put(struct bpf_key *key) __ksym; + +SEC("lsm.s/bpf") +int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) +{ + struct bpf_key *bkey; + __u32 pid; + + pid = bpf_get_current_pid_tgid() >> 32; + if (pid != monitored_pid) + return 0; + + if (key_serial) + bkey = bpf_lookup_user_key(key_serial, flags); + else + bkey = bpf_lookup_system_key(key_id); + + if (!bkey) + return -ENOENT; + + bpf_key_put(bkey); + + return 0; +} From fc97590668ae60b94ad8bc4d9e85958f10cb3567 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:50 +0200 Subject: [PATCH 072/143] selftests/bpf: Add test for bpf_verify_pkcs7_signature() kfunc Perform several tests to ensure the correct implementation of the bpf_verify_pkcs7_signature() kfunc. Do the tests with data signed with a generated testing key (by using sign-file from scripts/) and with the tcp_bic.ko kernel module if it is found in the system. The test does not fail if tcp_bic.ko is not found. First, perform an unsuccessful signature verification without data. Second, perform a successful signature verification with the session keyring and a new one created for testing. Then, ensure that permission and validation checks are done properly on the keyring provided to bpf_verify_pkcs7_signature(), despite those checks were deferred at the time the keyring was retrieved with bpf_lookup_user_key(). The tests expect to encounter an error if the Search permission is removed from the keyring, or the keyring is expired. Finally, perform a successful and unsuccessful signature verification with the keyrings with pre-determined IDs (the last test fails because the key is not in the platform keyring). The test is currently in the deny list for s390x (JIT does not support calling kernel function). 
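For reference, the trailer layout the test parses when extracting the
signature from tcp_bic.ko looks as follows (a sketch of the logic in
populate_data_item_mod() in the diff; mod/modlen hold the file contents
and size):

	/* A signed module ends with:
	 *   [payload][PKCS#7 sig][struct module_signature][MODULE_SIG_STRING]
	 * so parsing walks backwards from the end of the file.
	 */
	marker_len = sizeof(MODULE_SIG_STRING) - 1;
	if (memcmp(mod + modlen - marker_len, MODULE_SIG_STRING, marker_len))
		return 0;			/* not a signed module */
	modlen -= marker_len;

	memcpy(&ms, mod + modlen - sizeof(ms), sizeof(ms));
	sig_len = __be32_to_cpu(ms.sig_len);
	modlen -= sig_len + sizeof(ms);		/* payload length */
	/* the signature itself starts at mod + modlen */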
Signed-off-by: Roberto Sassu Link: https://lore.kernel.org/r/20220920075951.929132-13-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + tools/testing/selftests/bpf/Makefile | 14 +- tools/testing/selftests/bpf/config | 5 + tools/testing/selftests/bpf/config.x86_64 | 5 - .../bpf/prog_tests/verify_pkcs7_sig.c | 399 ++++++++++++++++++ .../bpf/progs/test_verify_pkcs7_sig.c | 90 ++++ .../testing/selftests/bpf/verify_sig_setup.sh | 104 +++++ 7 files changed, 610 insertions(+), 8 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c create mode 100644 tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c create mode 100755 tools/testing/selftests/bpf/verify_sig_setup.sh diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index a6ac5dce7856..a992fd978c1e 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -73,3 +73,4 @@ htab_update # failed to attach: ERROR: strerror_r(- tracing_struct # failed to auto-attach: -524 (trampoline) user_ringbuf # failed to find kernel BTF type ID of '__s390x_sys_prctl': -3 (?) lookup_key # JIT does not support calling kernel function (kfunc) +verify_pkcs7_sig # JIT does not support calling kernel function (kfunc) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1a0296bd744a..5898d3828b82 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -14,6 +14,7 @@ BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool APIDIR := $(TOOLSINCDIR)/uapi GENDIR := $(abspath ../../../../include/generated) GENHDR := $(GENDIR)/autoconf.h +HOSTPKG_CONFIG := pkg-config ifneq ($(wildcard $(GENHDR)),) GENFLAGS := -DHAVE_GENHDR @@ -75,7 +76,7 @@ TEST_PROGS := test_kmod.sh \ test_xsk.sh TEST_PROGS_EXTENDED := with_addr.sh \ - with_tunnels.sh ima_setup.sh \ + with_tunnels.sh ima_setup.sh verify_sig_setup.sh \ test_xdp_vlan.sh test_bpftool.py # Compile but not part of 'make run_tests' @@ -84,7 +85,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ xskxceiver xdp_redirect_multi xdp_synproxy veristat -TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read +TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/sign-file # Emit succinct information message describing current building step # $1 - generic step name (e.g., CC, LINK, etc); @@ -189,6 +190,12 @@ $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_r -fuse-ld=$(LLD) -Wl,-znoseparate-code \ -Wl,-rpath=. 
-Wl,--build-id=sha1 -o $@ +$(OUTPUT)/sign-file: ../../../../scripts/sign-file.c + $(call msg,SIGN-FILE,,$@) + $(Q)$(CC) $(shell $(HOSTPKG_CONFIG)--cflags libcrypto 2> /dev/null) \ + $< -o $@ \ + $(shell $(HOSTPKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto) + $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) $(call msg,MOD,,$@) $(Q)$(RM) bpf_testmod/bpf_testmod.ko # force re-compilation @@ -516,7 +523,8 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(OUTPUT)/liburandom_read.so \ $(OUTPUT)/xdp_synproxy \ - ima_setup.sh \ + $(OUTPUT)/sign-file \ + ima_setup.sh verify_sig_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index add5a5a919b4..905a9be8d0a2 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -33,6 +33,11 @@ CONFIG_IPV6_TUNNEL=y CONFIG_KEYS=y CONFIG_LIRC=y CONFIG_LWTUNNEL=y +CONFIG_MODULE_SIG=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULES=y +CONFIG_MODVERSIONS=y CONFIG_MPLS=y CONFIG_MPLS_IPTUNNEL=y CONFIG_MPLS_ROUTING=y diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64 index ce70c9509204..21ce5ea4304e 100644 --- a/tools/testing/selftests/bpf/config.x86_64 +++ b/tools/testing/selftests/bpf/config.x86_64 @@ -145,11 +145,6 @@ CONFIG_MCORE2=y CONFIG_MEMCG=y CONFIG_MEMORY_FAILURE=y CONFIG_MINIX_SUBPARTITION=y -CONFIG_MODULE_SIG=y -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULES=y -CONFIG_MODVERSIONS=y CONFIG_NAMESPACES=y CONFIG_NET=y CONFIG_NET_9P=y diff --git a/tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c b/tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c new file mode 100644 index 000000000000..579d6ee83ce0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH + * + * Author: Roberto Sassu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_verify_pkcs7_sig.skel.h" + +#define MAX_DATA_SIZE (1024 * 1024) +#define MAX_SIG_SIZE 1024 + +#define VERIFY_USE_SECONDARY_KEYRING (1UL) +#define VERIFY_USE_PLATFORM_KEYRING (2UL) + +/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ +#define MODULE_SIG_STRING "~Module signature appended~\n" + +/* + * Module signature information block. 
+ * + * The constituents of the signature section are, in order: + * + * - Signer's name + * - Key identifier + * - Signature data + * - Information block + */ +struct module_signature { + __u8 algo; /* Public-key crypto algorithm [0] */ + __u8 hash; /* Digest algorithm [0] */ + __u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + __u8 signer_len; /* Length of signer's name [0] */ + __u8 key_id_len; /* Length of key identifier [0] */ + __u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ +}; + +struct data { + __u8 data[MAX_DATA_SIZE]; + __u32 data_len; + __u8 sig[MAX_SIG_SIZE]; + __u32 sig_len; +}; + +static bool kfunc_not_supported; + +static int libbpf_print_cb(enum libbpf_print_level level, const char *fmt, + va_list args) +{ + if (strcmp(fmt, "libbpf: extern (func ksym) '%s': not found in kernel or module BTFs\n")) + return 0; + + if (strcmp(va_arg(args, char *), "bpf_verify_pkcs7_signature")) + return 0; + + kfunc_not_supported = true; + return 0; +} + +static int _run_setup_process(const char *setup_dir, const char *cmd) +{ + int child_pid, child_status; + + child_pid = fork(); + if (child_pid == 0) { + execlp("./verify_sig_setup.sh", "./verify_sig_setup.sh", cmd, + setup_dir, NULL); + exit(errno); + + } else if (child_pid > 0) { + waitpid(child_pid, &child_status, 0); + return WEXITSTATUS(child_status); + } + + return -EINVAL; +} + +static int populate_data_item_str(const char *tmp_dir, struct data *data_item) +{ + struct stat st; + char data_template[] = "/tmp/dataXXXXXX"; + char path[PATH_MAX]; + int ret, fd, child_status, child_pid; + + data_item->data_len = 4; + memcpy(data_item->data, "test", data_item->data_len); + + fd = mkstemp(data_template); + if (fd == -1) + return -errno; + + ret = write(fd, data_item->data, data_item->data_len); + + close(fd); + + if (ret != data_item->data_len) { + ret = -EIO; + goto out; + } + + child_pid = fork(); + + if (child_pid == -1) { + ret = -errno; + goto out; + } + + if (child_pid == 0) { + snprintf(path, sizeof(path), "%s/signing_key.pem", tmp_dir); + + return execlp("./sign-file", "./sign-file", "-d", "sha256", + path, path, data_template, NULL); + } + + waitpid(child_pid, &child_status, 0); + + ret = WEXITSTATUS(child_status); + if (ret) + goto out; + + snprintf(path, sizeof(path), "%s.p7s", data_template); + + ret = stat(path, &st); + if (ret == -1) { + ret = -errno; + goto out; + } + + if (st.st_size > sizeof(data_item->sig)) { + ret = -EINVAL; + goto out_sig; + } + + data_item->sig_len = st.st_size; + + fd = open(path, O_RDONLY); + if (fd == -1) { + ret = -errno; + goto out_sig; + } + + ret = read(fd, data_item->sig, data_item->sig_len); + + close(fd); + + if (ret != data_item->sig_len) { + ret = -EIO; + goto out_sig; + } + + ret = 0; +out_sig: + unlink(path); +out: + unlink(data_template); + return ret; +} + +static int populate_data_item_mod(struct data *data_item) +{ + char mod_path[PATH_MAX], *mod_path_ptr; + struct stat st; + void *mod; + FILE *fp; + struct module_signature ms; + int ret, fd, modlen, marker_len, sig_len; + + data_item->data_len = 0; + + if (stat("/lib/modules", &st) == -1) + return 0; + + /* Requires CONFIG_TCP_CONG_BIC=m. 
*/ + fp = popen("find /lib/modules/$(uname -r) -name tcp_bic.ko", "r"); + if (!fp) + return 0; + + mod_path_ptr = fgets(mod_path, sizeof(mod_path), fp); + pclose(fp); + + if (!mod_path_ptr) + return 0; + + mod_path_ptr = strchr(mod_path, '\n'); + if (!mod_path_ptr) + return 0; + + *mod_path_ptr = '\0'; + + if (stat(mod_path, &st) == -1) + return 0; + + modlen = st.st_size; + marker_len = sizeof(MODULE_SIG_STRING) - 1; + + fd = open(mod_path, O_RDONLY); + if (fd == -1) + return -errno; + + mod = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + + close(fd); + + if (mod == MAP_FAILED) + return -errno; + + if (strncmp(mod + modlen - marker_len, MODULE_SIG_STRING, marker_len)) { + ret = -EINVAL; + goto out; + } + + modlen -= marker_len; + + memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); + + sig_len = __be32_to_cpu(ms.sig_len); + modlen -= sig_len + sizeof(ms); + + if (modlen > sizeof(data_item->data)) { + ret = -E2BIG; + goto out; + } + + memcpy(data_item->data, mod, modlen); + data_item->data_len = modlen; + + if (sig_len > sizeof(data_item->sig)) { + ret = -E2BIG; + goto out; + } + + memcpy(data_item->sig, mod + modlen, sig_len); + data_item->sig_len = sig_len; + ret = 0; +out: + munmap(mod, st.st_size); + return ret; +} + +void test_verify_pkcs7_sig(void) +{ + libbpf_print_fn_t old_print_cb; + char tmp_dir_template[] = "/tmp/verify_sigXXXXXX"; + char *tmp_dir; + struct test_verify_pkcs7_sig *skel = NULL; + struct bpf_map *map; + struct data data; + int ret, zero = 0; + + /* Trigger creation of session keyring. */ + syscall(__NR_request_key, "keyring", "_uid.0", NULL, + KEY_SPEC_SESSION_KEYRING); + + tmp_dir = mkdtemp(tmp_dir_template); + if (!ASSERT_OK_PTR(tmp_dir, "mkdtemp")) + return; + + ret = _run_setup_process(tmp_dir, "setup"); + if (!ASSERT_OK(ret, "_run_setup_process")) + goto close_prog; + + skel = test_verify_pkcs7_sig__open(); + if (!ASSERT_OK_PTR(skel, "test_verify_pkcs7_sig__open")) + goto close_prog; + + old_print_cb = libbpf_set_print(libbpf_print_cb); + ret = test_verify_pkcs7_sig__load(skel); + libbpf_set_print(old_print_cb); + + if (ret < 0 && kfunc_not_supported) { + printf( + "%s:SKIP:bpf_verify_pkcs7_signature() kfunc not supported\n", + __func__); + test__skip(); + goto close_prog; + } + + if (!ASSERT_OK(ret, "test_verify_pkcs7_sig__load")) + goto close_prog; + + ret = test_verify_pkcs7_sig__attach(skel); + if (!ASSERT_OK(ret, "test_verify_pkcs7_sig__attach")) + goto close_prog; + + map = bpf_object__find_map_by_name(skel->obj, "data_input"); + if (!ASSERT_OK_PTR(map, "data_input not found")) + goto close_prog; + + skel->bss->monitored_pid = getpid(); + + /* Test without data and signature. */ + skel->bss->user_keyring_serial = KEY_SPEC_SESSION_KEYRING; + + ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY); + if (!ASSERT_LT(ret, 0, "bpf_map_update_elem data_input")) + goto close_prog; + + /* Test successful signature verification with session keyring. */ + ret = populate_data_item_str(tmp_dir, &data); + if (!ASSERT_OK(ret, "populate_data_item_str")) + goto close_prog; + + ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY); + if (!ASSERT_OK(ret, "bpf_map_update_elem data_input")) + goto close_prog; + + /* Test successful signature verification with testing keyring. 
*/ + skel->bss->user_keyring_serial = syscall(__NR_request_key, "keyring", + "ebpf_testing_keyring", NULL, + KEY_SPEC_SESSION_KEYRING); + + ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY); + if (!ASSERT_OK(ret, "bpf_map_update_elem data_input")) + goto close_prog; + + /* + * Ensure key_task_permission() is called and rejects the keyring + * (no Search permission). + */ + syscall(__NR_keyctl, KEYCTL_SETPERM, skel->bss->user_keyring_serial, + 0x37373737); + + ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY); + if (!ASSERT_LT(ret, 0, "bpf_map_update_elem data_input")) + goto close_prog; + + syscall(__NR_keyctl, KEYCTL_SETPERM, skel->bss->user_keyring_serial, + 0x3f3f3f3f); + + /* + * Ensure key_validate() is called and rejects the keyring (key expired) + */ + syscall(__NR_keyctl, KEYCTL_SET_TIMEOUT, + skel->bss->user_keyring_serial, 1); + sleep(1); + + ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY); + if (!ASSERT_LT(ret, 0, "bpf_map_update_elem data_input")) + goto close_prog; + + skel->bss->user_keyring_serial = KEY_SPEC_SESSION_KEYRING; + + /* Test with corrupted data (signature verification should fail). */ + data.data[0] = 'a'; + ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY); + if (!ASSERT_LT(ret, 0, "bpf_map_update_elem data_input")) + goto close_prog; + + ret = populate_data_item_mod(&data); + if (!ASSERT_OK(ret, "populate_data_item_mod")) + goto close_prog; + + /* Test signature verification with system keyrings. */ + if (data.data_len) { + skel->bss->user_keyring_serial = 0; + skel->bss->system_keyring_id = 0; + + ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, + BPF_ANY); + if (!ASSERT_OK(ret, "bpf_map_update_elem data_input")) + goto close_prog; + + skel->bss->system_keyring_id = VERIFY_USE_SECONDARY_KEYRING; + + ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, + BPF_ANY); + if (!ASSERT_OK(ret, "bpf_map_update_elem data_input")) + goto close_prog; + + skel->bss->system_keyring_id = VERIFY_USE_PLATFORM_KEYRING; + + ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, + BPF_ANY); + ASSERT_LT(ret, 0, "bpf_map_update_elem data_input"); + } + +close_prog: + _run_setup_process(tmp_dir, "cleanup"); + + if (!skel) + return; + + skel->bss->monitored_pid = 0; + test_verify_pkcs7_sig__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c new file mode 100644 index 000000000000..ce419304ff1f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH + * + * Author: Roberto Sassu + */ + +#include "vmlinux.h" +#include +#include +#include + +#define MAX_DATA_SIZE (1024 * 1024) +#define MAX_SIG_SIZE 1024 + +extern struct bpf_key *bpf_lookup_user_key(__u32 serial, __u64 flags) __ksym; +extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym; +extern void bpf_key_put(struct bpf_key *key) __ksym; +extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, + struct bpf_dynptr *sig_ptr, + struct bpf_key *trusted_keyring) __ksym; + +__u32 monitored_pid; +__u32 user_keyring_serial; +__u64 system_keyring_id; + +struct data { + __u8 data[MAX_DATA_SIZE]; + __u32 data_len; + __u8 sig[MAX_SIG_SIZE]; + __u32 sig_len; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct data); +} data_input 
SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +SEC("lsm.s/bpf") +int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) +{ + struct bpf_dynptr data_ptr, sig_ptr; + struct data *data_val; + struct bpf_key *trusted_keyring; + __u32 pid; + __u64 value; + int ret, zero = 0; + + pid = bpf_get_current_pid_tgid() >> 32; + if (pid != monitored_pid) + return 0; + + data_val = bpf_map_lookup_elem(&data_input, &zero); + if (!data_val) + return 0; + + bpf_probe_read(&value, sizeof(value), &attr->value); + + bpf_copy_from_user(data_val, sizeof(struct data), + (void *)(unsigned long)value); + + if (data_val->data_len > sizeof(data_val->data)) + return -EINVAL; + + bpf_dynptr_from_mem(data_val->data, data_val->data_len, 0, &data_ptr); + + if (data_val->sig_len > sizeof(data_val->sig)) + return -EINVAL; + + bpf_dynptr_from_mem(data_val->sig, data_val->sig_len, 0, &sig_ptr); + + if (user_keyring_serial) + trusted_keyring = bpf_lookup_user_key(user_keyring_serial, 0); + else + trusted_keyring = bpf_lookup_system_key(system_keyring_id); + + if (!trusted_keyring) + return -ENOENT; + + ret = bpf_verify_pkcs7_signature(&data_ptr, &sig_ptr, trusted_keyring); + + bpf_key_put(trusted_keyring); + + return ret; +} diff --git a/tools/testing/selftests/bpf/verify_sig_setup.sh b/tools/testing/selftests/bpf/verify_sig_setup.sh new file mode 100755 index 000000000000..ba08922b4a27 --- /dev/null +++ b/tools/testing/selftests/bpf/verify_sig_setup.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e +set -u +set -o pipefail + +VERBOSE="${SELFTESTS_VERBOSE:=0}" +LOG_FILE="$(mktemp /tmp/verify_sig_setup.log.XXXXXX)" + +x509_genkey_content="\ +[ req ] +default_bits = 2048 +distinguished_name = req_distinguished_name +prompt = no +string_mask = utf8only +x509_extensions = myexts + +[ req_distinguished_name ] +CN = eBPF Signature Verification Testing Key + +[ myexts ] +basicConstraints=critical,CA:FALSE +keyUsage=digitalSignature +subjectKeyIdentifier=hash +authorityKeyIdentifier=keyid +" + +usage() +{ + echo "Usage: $0 " + exit 1 +} + +setup() +{ + local tmp_dir="$1" + + echo "${x509_genkey_content}" > ${tmp_dir}/x509.genkey + + openssl req -new -nodes -utf8 -sha256 -days 36500 \ + -batch -x509 -config ${tmp_dir}/x509.genkey \ + -outform PEM -out ${tmp_dir}/signing_key.pem \ + -keyout ${tmp_dir}/signing_key.pem 2>&1 + + openssl x509 -in ${tmp_dir}/signing_key.pem -out \ + ${tmp_dir}/signing_key.der -outform der + + key_id=$(cat ${tmp_dir}/signing_key.der | keyctl padd asymmetric ebpf_testing_key @s) + + keyring_id=$(keyctl newring ebpf_testing_keyring @s) + keyctl link $key_id $keyring_id +} + +cleanup() { + local tmp_dir="$1" + + keyctl unlink $(keyctl search @s asymmetric ebpf_testing_key) @s + keyctl unlink $(keyctl search @s keyring ebpf_testing_keyring) @s + rm -rf ${tmp_dir} +} + +catch() +{ + local exit_code="$1" + local log_file="$2" + + if [[ "${exit_code}" -ne 0 ]]; then + cat "${log_file}" >&3 + fi + + rm -f "${log_file}" + exit ${exit_code} +} + +main() +{ + [[ $# -ne 2 ]] && usage + + local action="$1" + local tmp_dir="$2" + + [[ ! -d "${tmp_dir}" ]] && echo "Directory ${tmp_dir} doesn't exist" && exit 1 + + if [[ "${action}" == "setup" ]]; then + setup "${tmp_dir}" + elif [[ "${action}" == "cleanup" ]]; then + cleanup "${tmp_dir}" + else + echo "Unknown action: ${action}" + exit 1 + fi +} + +trap 'catch "$?" "${LOG_FILE}"' EXIT + +if [[ "${VERBOSE}" -eq 0 ]]; then + # Save the stderr to 3 so that we can output back to + # it incase of an error. 
+ exec 3>&2 1>"${LOG_FILE}" 2>&1 +fi + +main "$@" +rm -f "${LOG_FILE}" From b94fa9f9dcf99730eabd8febc4c95e44342bfb59 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:51 +0200 Subject: [PATCH 073/143] selftests/bpf: Add tests for dynamic pointers parameters in kfuncs Add tests to ensure that only supported dynamic pointer types are accepted, that the passed argument is actually a dynamic pointer, that the passed argument is a pointer to the stack, and that bpf_verify_pkcs7_signature() correctly handles dynamic pointers with data set to NULL. The tests are currently in the deny list for s390x (JIT does not support calling kernel function). Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-14-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + .../bpf/prog_tests/kfunc_dynptr_param.c | 164 ++++++++++++++++++ .../bpf/progs/test_kfunc_dynptr_param.c | 94 ++++++++++ 3 files changed, 259 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c create mode 100644 tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index a992fd978c1e..17e074eb42b8 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -74,3 +74,4 @@ tracing_struct # failed to auto-attach: -524 user_ringbuf # failed to find kernel BTF type ID of '__s390x_sys_prctl': -3 (?) lookup_key # JIT does not support calling kernel function (kfunc) verify_pkcs7_sig # JIT does not support calling kernel function (kfunc) +kfunc_dynptr_param # JIT does not support calling kernel function (kfunc) diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c new file mode 100644 index 000000000000..c210657d4d0a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2022 Facebook + * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH + * + * Author: Roberto Sassu + */ + +#include +#include "test_kfunc_dynptr_param.skel.h" + +static size_t log_buf_sz = 1048576; /* 1 MB */ +static char obj_log_buf[1048576]; + +static struct { + const char *prog_name; + const char *expected_verifier_err_msg; + int expected_runtime_err; +} kfunc_dynptr_tests[] = { + {"dynptr_type_not_supp", + "arg#0 pointer type STRUCT bpf_dynptr_kern points to unsupported dynamic pointer type", 0}, + {"not_valid_dynptr", + "arg#0 pointer type STRUCT bpf_dynptr_kern must be valid and initialized", 0}, + {"not_ptr_to_stack", "arg#0 pointer type STRUCT bpf_dynptr_kern not to stack", 0}, + {"dynptr_data_null", NULL, -EBADMSG}, +}; + +static bool kfunc_not_supported; + +static int libbpf_print_cb(enum libbpf_print_level level, const char *fmt, + va_list args) +{ + if (strcmp(fmt, "libbpf: extern (func ksym) '%s': not found in kernel or module BTFs\n")) + return 0; + + if (strcmp(va_arg(args, char *), "bpf_verify_pkcs7_signature")) + return 0; + + kfunc_not_supported = true; + return 0; +} + +static void verify_fail(const char *prog_name, const char *expected_err_msg) +{ + struct test_kfunc_dynptr_param *skel; + LIBBPF_OPTS(bpf_object_open_opts, opts); + libbpf_print_fn_t old_print_cb; + struct bpf_program *prog; + int err; + + opts.kernel_log_buf = obj_log_buf; + 
opts.kernel_log_size = log_buf_sz; + opts.kernel_log_level = 1; + + skel = test_kfunc_dynptr_param__open_opts(&opts); + if (!ASSERT_OK_PTR(skel, "test_kfunc_dynptr_param__open_opts")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto cleanup; + + bpf_program__set_autoload(prog, true); + + bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize()); + + kfunc_not_supported = false; + + old_print_cb = libbpf_set_print(libbpf_print_cb); + err = test_kfunc_dynptr_param__load(skel); + libbpf_set_print(old_print_cb); + + if (err < 0 && kfunc_not_supported) { + fprintf(stderr, + "%s:SKIP:bpf_verify_pkcs7_signature() kfunc not supported\n", + __func__); + test__skip(); + goto cleanup; + } + + if (!ASSERT_ERR(err, "unexpected load success")) + goto cleanup; + + if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) { + fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg); + fprintf(stderr, "Verifier output: %s\n", obj_log_buf); + } + +cleanup: + test_kfunc_dynptr_param__destroy(skel); +} + +static void verify_success(const char *prog_name, int expected_runtime_err) +{ + struct test_kfunc_dynptr_param *skel; + libbpf_print_fn_t old_print_cb; + struct bpf_program *prog; + struct bpf_link *link; + __u32 next_id; + int err; + + skel = test_kfunc_dynptr_param__open(); + if (!ASSERT_OK_PTR(skel, "test_kfunc_dynptr_param__open")) + return; + + skel->bss->pid = getpid(); + + bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize()); + + kfunc_not_supported = false; + + old_print_cb = libbpf_set_print(libbpf_print_cb); + err = test_kfunc_dynptr_param__load(skel); + libbpf_set_print(old_print_cb); + + if (err < 0 && kfunc_not_supported) { + fprintf(stderr, + "%s:SKIP:bpf_verify_pkcs7_signature() kfunc not supported\n", + __func__); + test__skip(); + goto cleanup; + } + + if (!ASSERT_OK(err, "test_kfunc_dynptr_param__load")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto cleanup; + + link = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto cleanup; + + err = bpf_prog_get_next_id(0, &next_id); + + bpf_link__destroy(link); + + if (!ASSERT_OK(err, "bpf_prog_get_next_id")) + goto cleanup; + + ASSERT_EQ(skel->bss->err, expected_runtime_err, "err"); + +cleanup: + test_kfunc_dynptr_param__destroy(skel); +} + +void test_kfunc_dynptr_param(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(kfunc_dynptr_tests); i++) { + if (!test__start_subtest(kfunc_dynptr_tests[i].prog_name)) + continue; + + if (kfunc_dynptr_tests[i].expected_verifier_err_msg) + verify_fail(kfunc_dynptr_tests[i].prog_name, + kfunc_dynptr_tests[i].expected_verifier_err_msg); + else + verify_success(kfunc_dynptr_tests[i].prog_name, + kfunc_dynptr_tests[i].expected_runtime_err); + } +} diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c new file mode 100644 index 000000000000..ce39d096bba3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH + * + * Author: Roberto Sassu + */ + +#include "vmlinux.h" +#include +#include +#include + +extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym; +extern void bpf_key_put(struct bpf_key *key) __ksym; +extern 
int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, + struct bpf_dynptr *sig_ptr, + struct bpf_key *trusted_keyring) __ksym; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); +} array_map SEC(".maps"); + +int err, pid; + +char _license[] SEC("license") = "GPL"; + +SEC("?lsm.s/bpf") +int BPF_PROG(dynptr_type_not_supp, int cmd, union bpf_attr *attr, + unsigned int size) +{ + char write_data[64] = "hello there, world!!"; + struct bpf_dynptr ptr; + + bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr); + + return bpf_verify_pkcs7_signature(&ptr, &ptr, NULL); +} + +SEC("?lsm.s/bpf") +int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size) +{ + unsigned long val; + + return bpf_verify_pkcs7_signature((struct bpf_dynptr *)&val, + (struct bpf_dynptr *)&val, NULL); +} + +SEC("?lsm.s/bpf") +int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size) +{ + unsigned long val; + + return bpf_verify_pkcs7_signature((struct bpf_dynptr *)val, + (struct bpf_dynptr *)val, NULL); +} + +SEC("lsm.s/bpf") +int BPF_PROG(dynptr_data_null, int cmd, union bpf_attr *attr, unsigned int size) +{ + struct bpf_key *trusted_keyring; + struct bpf_dynptr ptr; + __u32 *value; + int ret, zero = 0; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + value = bpf_map_lookup_elem(&array_map, &zero); + if (!value) + return 0; + + /* Pass invalid flags. */ + ret = bpf_dynptr_from_mem(value, sizeof(*value), ((__u64)~0ULL), &ptr); + if (ret != -EINVAL) + return 0; + + trusted_keyring = bpf_lookup_system_key(0); + if (!trusted_keyring) + return 0; + + err = bpf_verify_pkcs7_signature(&ptr, &ptr, trusted_keyring); + + bpf_key_put(trusted_keyring); + + return 0; +} From 05b24ff9b2cfabfcfd951daaa915a036ab53c9e1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 16 Sep 2022 09:19:14 +0200 Subject: [PATCH 074/143] bpf: Prevent bpf program recursion for raw tracepoint probes We got a report from syzbot [1] about warnings that were caused by a bpf program attached to the contention_begin raw tracepoint triggering the same tracepoint by using the bpf_trace_printk helper, which takes the trace_printk_lock lock. Call Trace: ? trace_event_raw_event_bpf_trace_printk+0x5f/0x90 bpf_trace_printk+0x2b/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 __unfreeze_partials+0x5b/0x160 ... This can be reproduced by attaching a bpf program as a raw tracepoint on the contention_begin tracepoint. The bpf prog calls the bpf_trace_printk helper. Then, by running perf bench, the spin lock code is forced to take the slow path and call the contention_begin tracepoint. Fix this by skipping execution of the bpf program if it is already running, using the bpf prog 'active' field, which is currently used by trampoline programs for the same reason.
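The guard follows the same pattern as the trampoline enter handlers. The resulting __bpf_trace_run() (the bpf_trace.c hunk appears in full below) is effectively:

  static __always_inline void
  __bpf_trace_run(struct bpf_prog *prog, u64 *args)
  {
  	cant_sleep();
  	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
  		/* prog is already running on this CPU: count a miss, skip */
  		bpf_prog_inc_misses_counter(prog);
  		goto out;
  	}
  	rcu_read_lock();
  	(void) bpf_prog_run(prog, args);
  	rcu_read_unlock();
  out:
  	this_cpu_dec(*(prog->active));
  }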
Moving bpf_prog_inc_misses_counter to syscall.c because trampoline.c is compiled in just for CONFIG_BPF_JIT option. Reviewed-by: Stanislav Fomichev Reported-by: syzbot+2251879aa068ad9c960d@syzkaller.appspotmail.com [1] https://lore.kernel.org/bpf/YxhFe3EwqchC%2FfYf@krava/T/#t Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220916071914.7156-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ kernel/bpf/syscall.c | 11 +++++++++++ kernel/bpf/trampoline.c | 15 ++------------- kernel/trace/bpf_trace.c | 6 ++++++ 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a1435b019aca..edd43edb27d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2042,6 +2042,8 @@ static inline bool has_current_bpf_ctx(void) { return !!current->bpf_ctx; } + +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2264,6 +2266,10 @@ static inline bool has_current_bpf_ctx(void) { return false; } + +static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dab156f09f8d..372fad5ef3d3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2093,6 +2093,17 @@ struct bpf_prog_kstats { u64 misses; }; +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + unsigned int flags; + + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->misses); + u64_stats_update_end_irqrestore(&stats->syncp, flags); +} + static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_kstats *stats) { diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ad76940b02cc..41b67eb83ab3 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -863,17 +863,6 @@ static __always_inline u64 notrace bpf_prog_start_time(void) return start; } -static void notrace inc_misses_counter(struct bpf_prog *prog) -{ - struct bpf_prog_stats *stats; - unsigned int flags; - - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->misses); - u64_stats_update_end_irqrestore(&stats->syncp, flags); -} - /* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. 
The macro is split into @@ -896,7 +885,7 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { - inc_misses_counter(prog); + bpf_prog_inc_misses_counter(prog); return 0; } return bpf_prog_start_time(); @@ -967,7 +956,7 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r might_fault(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { - inc_misses_counter(prog); + bpf_prog_inc_misses_counter(prog); return 0; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9df53c40cffd..b05f0310dbd3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2222,9 +2222,15 @@ static __always_inline void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { cant_sleep(); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); + goto out; + } rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); +out: + this_cpu_dec(*(prog->active)); } #define UNPACK(...) __VA_ARGS__ From 1d8b82c613297f24354b4d750413a7456b5cd92c Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 21 Sep 2022 15:38:26 +0800 Subject: [PATCH 075/143] bpf: Always use raw spinlock for hash bucket lock For a non-preallocated hash map on RT kernel, regular spinlock instead of raw spinlock is used for bucket lock. The reason is that on RT kernel memory allocation is forbidden under atomic context and regular spinlock is sleepable under RT. Now hash map has been fully converted to use bpf_map_alloc, and there will be no synchronous memory allocation for non-preallocated hash map, so it is safe to always use raw spinlock for bucket lock on RT. So removing the usage of htab_use_raw_lock() and updating the comments accordingly. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220921073826.2365800-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 66 ++++++++++---------------------------------- 1 file changed, 14 insertions(+), 52 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 86aec20c22d0..ed3f8a53603b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -68,24 +68,16 @@ * In theory the BPF locks could be converted to regular spinlocks as well, * but the bucket locks and percpu_freelist locks can be taken from * arbitrary contexts (perf, kprobes, tracepoints) which are required to be - * atomic contexts even on RT. These mechanisms require preallocated maps, - * so there is no need to invoke memory allocations within the lock held - * sections. - * - * BPF maps which need dynamic allocation are only used from (forced) - * thread context on RT and can therefore use regular spinlocks which in - * turn allows to invoke memory allocations from the lock held section. - * - * On a non RT kernel this distinction is neither possible nor required. - * spinlock maps to raw_spinlock and the extra code is optimized out by the - * compiler. + * atomic contexts even on RT. Before the introduction of bpf_mem_alloc, + * it is only safe to use raw spinlock for preallocated hash map on a RT kernel, + * because there is no memory allocation within the lock held sections. However + * after hash map was fully converted to use bpf_mem_alloc, there will be + * non-synchronous memory allocation for non-preallocated hash map, so it is + * safe to always use raw spinlock for bucket lock. 
*/ struct bucket { struct hlist_nulls_head head; - union { - raw_spinlock_t raw_lock; - spinlock_t lock; - }; + raw_spinlock_t raw_lock; }; #define HASHTAB_MAP_LOCK_COUNT 8 @@ -141,26 +133,15 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab) return !(htab->map.map_flags & BPF_F_NO_PREALLOC); } -static inline bool htab_use_raw_lock(const struct bpf_htab *htab) -{ - return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab)); -} - static void htab_init_buckets(struct bpf_htab *htab) { unsigned int i; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - if (htab_use_raw_lock(htab)) { - raw_spin_lock_init(&htab->buckets[i].raw_lock); - lockdep_set_class(&htab->buckets[i].raw_lock, + raw_spin_lock_init(&htab->buckets[i].raw_lock); + lockdep_set_class(&htab->buckets[i].raw_lock, &htab->lockdep_key); - } else { - spin_lock_init(&htab->buckets[i].lock); - lockdep_set_class(&htab->buckets[i].lock, - &htab->lockdep_key); - } cond_resched(); } } @@ -170,28 +151,17 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, unsigned long *pflags) { unsigned long flags; - bool use_raw_lock; hash = hash & HASHTAB_MAP_LOCK_MASK; - use_raw_lock = htab_use_raw_lock(htab); - if (use_raw_lock) - preempt_disable(); - else - migrate_disable(); + preempt_disable(); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); - if (use_raw_lock) - preempt_enable(); - else - migrate_enable(); + preempt_enable(); return -EBUSY; } - if (use_raw_lock) - raw_spin_lock_irqsave(&b->raw_lock, flags); - else - spin_lock_irqsave(&b->lock, flags); + raw_spin_lock_irqsave(&b->raw_lock, flags); *pflags = flags; return 0; @@ -201,18 +171,10 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, struct bucket *b, u32 hash, unsigned long flags) { - bool use_raw_lock = htab_use_raw_lock(htab); - hash = hash & HASHTAB_MAP_LOCK_MASK; - if (use_raw_lock) - raw_spin_unlock_irqrestore(&b->raw_lock, flags); - else - spin_unlock_irqrestore(&b->lock, flags); + raw_spin_unlock_irqrestore(&b->raw_lock, flags); __this_cpu_dec(*(htab->map_locked[hash])); - if (use_raw_lock) - preempt_enable(); - else - migrate_enable(); + preempt_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); From eed807f626101f6a4227bd53942892c5983b95a7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 21 Sep 2022 18:48:25 +0200 Subject: [PATCH 076/143] bpf: Tweak definition of KF_TRUSTED_ARGS Instead of forcing all arguments to be referenced pointers with non-zero reg->ref_obj_id, tweak the definition of KF_TRUSTED_ARGS to mean that only PTR_TO_BTF_ID (and socket types translated to PTR_TO_BTF_ID) have that constraint, and require their offset to be set to 0. The rest of the pointer types are also accommodated in this definition of trusted pointers, but with more relaxed rules regarding offsets. The inherent meaning of setting this flag is that all kfunc pointer arguments have a guaranteed lifetime, and kernel object pointers (PTR_TO_BTF_ID, PTR_TO_CTX) are passed in their unmodified form (with offset 0). In general, this is not true for PTR_TO_BTF_ID as it can be obtained using pointer walks.
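For reference, a kfunc opts into these checks at registration time through its kfunc ID set flags; a minimal sketch, modeled on the conntrack kfunc set touched later in this series:

  BTF_SET8_START(example_kfunc_set)
  BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
  BTF_SET8_END(example_kfunc_set)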
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/cdede0043c47ed7a357f0a915d16f9ce06a1d589.1663778601.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 24 ++++++++++++++++-------- kernel/bpf/btf.c | 18 +++++++++++++----- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 781731749e55..0f858156371d 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -137,14 +137,22 @@ KF_ACQUIRE and KF_RET_NULL flags. -------------------------- The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It -indicates that the all pointer arguments will always be refcounted, and have -their offset set to 0. It can be used to enforce that a pointer to a refcounted -object acquired from a kfunc or BPF helper is passed as an argument to this -kfunc without any modifications (e.g. pointer arithmetic) such that it is -trusted and points to the original object. This flag is often used for kfuncs -that operate (change some property, perform some operation) on an object that -was obtained using an acquire kfunc. Such kfuncs need an unchanged pointer to -ensure the integrity of the operation being performed on the expected object. +indicates that all pointer arguments will always have a guaranteed lifetime, +and pointers to kernel objects are always passed to helpers in their unmodified +form (as obtained from acquire kfuncs). + +It can be used to enforce that a pointer to a refcounted object acquired from a +kfunc or BPF helper is passed as an argument to this kfunc without any +modifications (e.g. pointer arithmetic) such that it is trusted and points to +the original object. + +Meanwhile, it is also allowed to pass pointers to normal memory to such kfuncs, +but those can have a non-zero offset. + +This flag is often used for kfuncs that operate (change some property, perform +some operation) on an object that was obtained using an acquire kfunc. Such +kfuncs need an unchanged pointer to ensure the integrity of the operation being +performed on the expected object.
2.4.6 KF_SLEEPABLE flag ----------------------- diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 13faede0f2b4..a44ad4b347ff 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6227,7 +6227,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - bool rel = false, kptr_get = false, trusted_arg = false; + bool rel = false, kptr_get = false, trusted_args = false; bool sleepable = false; struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; @@ -6265,7 +6265,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, /* Only kfunc can be release func */ rel = kfunc_meta->flags & KF_RELEASE; kptr_get = kfunc_meta->flags & KF_KPTR_GET; - trusted_arg = kfunc_meta->flags & KF_TRUSTED_ARGS; + trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS; sleepable = kfunc_meta->flags & KF_SLEEPABLE; } @@ -6276,6 +6276,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; + bool obj_ptr = false; t = btf_type_skip_modifiers(btf, args[i].type, NULL); if (btf_type_is_scalar(t)) { @@ -6323,10 +6324,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + /* These register types have special constraints wrt ref_obj_id + * and offset checks. The rest of trusted args don't. + */ + obj_ptr = reg->type == PTR_TO_CTX || reg->type == PTR_TO_BTF_ID || + reg2btf_ids[base_type(reg->type)]; + /* Check if argument must be a referenced pointer, args + i has * been verified to be a pointer (after skipping modifiers). + * PTR_TO_CTX is ok without having non-zero ref_obj_id. */ - if (is_kfunc && trusted_arg && !reg->ref_obj_id) { + if (is_kfunc && trusted_args && (obj_ptr && reg->type != PTR_TO_CTX) && !reg->ref_obj_id) { bpf_log(log, "R%d must be referenced\n", regno); return -EINVAL; } @@ -6335,7 +6343,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_tname = btf_name_by_offset(btf, ref_t->name_off); /* Trusted args have the same offset checks as release arguments */ - if (trusted_arg || (rel && reg->ref_obj_id)) + if ((trusted_args && obj_ptr) || (rel && reg->ref_obj_id)) arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) @@ -6435,7 +6443,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_t->name_off); if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, reg->off, btf, ref_id, - trusted_arg || (rel && reg->ref_obj_id))) { + trusted_args || (rel && reg->ref_obj_id))) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname, From 0fabd2aa199faeb8754aee94658f2c48ccb2c8c3 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 21 Sep 2022 18:48:26 +0200 Subject: [PATCH 077/143] net: netfilter: add bpf_ct_set_nat_info kfunc helper Introduce bpf_ct_set_nat_info kfunc helper in order to set source and destination nat addresses/ports in a new allocated ct entry not inserted in the connection tracking table yet. 
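Expected usage from a BPF program looks roughly as follows (a sketch only; ctx, tuple and opts are set up as for the existing CT lookup helpers, and the selftest in the next patch exercises this exact flow):

  union nf_inet_addr saddr = {};
  struct nf_conn *ct, *ct_ins;

  ct = bpf_xdp_ct_alloc(ctx, &tuple, sizeof(tuple.ipv4), &opts, sizeof(opts));
  if (!ct)
  	return XDP_PASS;

  saddr.ip = bpf_get_prandom_u32();
  /* must happen before bpf_ct_insert_entry() */
  bpf_ct_set_nat_info(ct, &saddr, -1 /* pick a random port */, NF_NAT_MANIP_SRC);
  ct_ins = bpf_ct_insert_entry(ct);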
Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/9567db2fdfa5bebe7b7cc5870f7a34549418b4fc.1663778601.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- net/netfilter/nf_conntrack_bpf.c | 47 +++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 67df64283aef..756ea818574e 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -17,6 +17,7 @@ #include #include #include +#include /* bpf_ct_opts - Options for CT lookup helpers * @@ -137,7 +138,6 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, memset(&ct->proto, 0, sizeof(ct->proto)); __nf_ct_set_timeout(ct, timeout * HZ); - ct->status |= IPS_CONFIRMED; out: if (opts->netns_id >= 0) @@ -390,6 +390,7 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) struct nf_conn *nfct = (struct nf_conn *)nfct_i; int err; + nfct->status |= IPS_CONFIRMED; err = nf_conntrack_hash_check_insert(nfct); if (err < 0) { nf_conntrack_free(nfct); @@ -475,6 +476,49 @@ int bpf_ct_change_status(struct nf_conn *nfct, u32 status) return nf_ct_change_status_common(nfct, status); } +/* bpf_ct_set_nat_info - Set source or destination nat address + * + * Set source or destination nat address of the newly allocated + * nf_conn before insertion. This must be invoked for referenced + * PTR_TO_BTF_ID to nf_conn___init. + * + * Parameters: + * @nfct - Pointer to referenced nf_conn object, obtained using + * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. + * @addr - Nat source/destination address + * @port - Nat source/destination port. Non-positive values are + * interpreted as select a random port. + * @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST + */ +int bpf_ct_set_nat_info(struct nf_conn___init *nfct, + union nf_inet_addr *addr, int port, + enum nf_nat_manip_type manip) +{ +#if ((IS_MODULE(CONFIG_NF_NAT) && IS_MODULE(CONFIG_NF_CONNTRACK)) || \ + IS_BUILTIN(CONFIG_NF_NAT)) + struct nf_conn *ct = (struct nf_conn *)nfct; + u16 proto = nf_ct_l3num(ct); + struct nf_nat_range2 range; + + if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6) + return -EINVAL; + + memset(&range, 0, sizeof(struct nf_nat_range2)); + range.flags = NF_NAT_RANGE_MAP_IPS; + range.min_addr = *addr; + range.max_addr = range.min_addr; + if (port > 0) { + range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + range.min_proto.all = cpu_to_be16(port); + range.max_proto.all = range.min_proto.all; + } + + return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; +#else + return -EOPNOTSUPP; +#endif +} + __diag_pop() BTF_SET8_START(nf_ct_kfunc_set) @@ -488,6 +532,7 @@ BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) BTF_SET8_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { From b06b45e82b59b69f5ac6b3916ac5dbd0294efc95 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 21 Sep 2022 18:48:27 +0200 Subject: [PATCH 078/143] selftests/bpf: add tests for bpf_ct_set_nat_info kfunc Introduce self-tests for bpf_ct_set_nat_info kfunc used to set the source or destination nat addresses/ports. 
Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/803e33294e247744d466943105879414344d3235.1663778601.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/config | 1 + .../testing/selftests/bpf/prog_tests/bpf_nf.c | 10 ++++--- .../testing/selftests/bpf/progs/test_bpf_nf.c | 27 +++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 905a9be8d0a2..9213565c0311 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -63,6 +63,7 @@ CONFIG_NF_CONNTRACK=y CONFIG_NF_CONNTRACK_MARK=y CONFIG_NF_DEFRAG_IPV4=y CONFIG_NF_DEFRAG_IPV6=y +CONFIG_NF_NAT=y CONFIG_RC_CORE=y CONFIG_SECURITY=y CONFIG_SECURITYFS=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index 0677a51694c9..8a838ea8bdf3 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -26,7 +26,10 @@ enum { TEST_TC_BPF, }; -#define TIMEOUT_MS 3000 +#define TIMEOUT_MS 3000 +#define IPS_STATUS_MASK (IPS_CONFIRMED | IPS_SEEN_REPLY | \ + IPS_SRC_NAT_DONE | IPS_DST_NAT_DONE | \ + IPS_SRC_NAT | IPS_DST_NAT) static int connect_to_server(int srv_fd) { @@ -114,10 +117,11 @@ static void test_bpf_nf_ct(int mode) ASSERT_GT(skel->bss->test_delta_timeout, 8, "Test for min ct timeout update"); ASSERT_LE(skel->bss->test_delta_timeout, 10, "Test for max ct timeout update"); ASSERT_EQ(skel->bss->test_insert_lookup_mark, 77, "Test for insert and lookup mark value"); - ASSERT_EQ(skel->bss->test_status, IPS_CONFIRMED | IPS_SEEN_REPLY, - "Test for ct status update "); + ASSERT_EQ(skel->bss->test_status, IPS_STATUS_MASK, "Test for ct status update "); ASSERT_EQ(skel->data->test_exist_lookup, 0, "Test existing connection lookup"); ASSERT_EQ(skel->bss->test_exist_lookup_mark, 43, "Test existing connection lookup ctmark"); + ASSERT_EQ(skel->data->test_snat_addr, 0, "Test for source natting"); + ASSERT_EQ(skel->data->test_dnat_addr, 0, "Test for destination natting"); end: if (srv_client_fd != -1) close(srv_client_fd); diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c index 88842da86ddc..227e85e85dda 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #define EAFNOSUPPORT 97 #define EPROTO 71 @@ -24,6 +25,8 @@ int test_succ_lookup = -ENOENT; u32 test_delta_timeout = 0; u32 test_status = 0; u32 test_insert_lookup_mark = 0; +int test_snat_addr = -EINVAL; +int test_dnat_addr = -EINVAL; __be32 saddr = 0; __be16 sport = 0; __be32 daddr = 0; @@ -54,6 +57,8 @@ void bpf_ct_set_timeout(struct nf_conn *, u32) __ksym; int bpf_ct_change_timeout(struct nf_conn *, u32) __ksym; int bpf_ct_set_status(struct nf_conn *, u32) __ksym; int bpf_ct_change_status(struct nf_conn *, u32) __ksym; +int bpf_ct_set_nat_info(struct nf_conn *, union nf_inet_addr *, + int port, enum nf_nat_manip_type) __ksym; static __always_inline void nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, @@ -141,11 +146,22 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, ct = alloc_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def)); if (ct) { + __u16 sport = bpf_get_prandom_u32(); + __u16 dport = bpf_get_prandom_u32(); + union 
nf_inet_addr saddr = {}; + union nf_inet_addr daddr = {}; struct nf_conn *ct_ins; bpf_ct_set_timeout(ct, 10000); ct->mark = 77; + /* snat */ + saddr.ip = bpf_get_prandom_u32(); + bpf_ct_set_nat_info(ct, &saddr, sport, NF_NAT_MANIP_SRC); + /* dnat */ + daddr.ip = bpf_get_prandom_u32(); + bpf_ct_set_nat_info(ct, &daddr, dport, NF_NAT_MANIP_DST); + ct_ins = bpf_ct_insert_entry(ct); if (ct_ins) { struct nf_conn *ct_lk; @@ -153,6 +169,17 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def)); if (ct_lk) { + struct nf_conntrack_tuple *tuple; + + /* check snat and dnat addresses */ + tuple = &ct_lk->tuplehash[IP_CT_DIR_REPLY].tuple; + if (tuple->dst.u3.ip == saddr.ip && + tuple->dst.u.all == bpf_htons(sport)) + test_snat_addr = 0; + if (tuple->src.u3.ip == daddr.ip && + tuple->src.u.all == bpf_htons(dport)) + test_dnat_addr = 0; + /* update ct entry timeout */ bpf_ct_change_timeout(ct_lk, 10000); test_delta_timeout = ct_lk->timeout - bpf_jiffies64(); From f338ac9105679df504c3809784f0716c25e87b31 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 21 Sep 2022 09:42:51 -0700 Subject: [PATCH 079/143] selftests/bpf: fix double bpf_object__close() in veristat bpf_object__close(obj) is called twice for BPF object files with a single BPF program in them. This causes a crash. Fix this by not calling bpf_object__close() unnecessarily. Fixes: c8bc5e050976 ("selftests/bpf: Add veristat tool for mass-verifying BPF object files") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220921164254.3630690-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 39e6dc41e504..c0c8a65cda52 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -300,7 +300,6 @@ static int process_obj(const char *filename) prog = bpf_object__next_program(obj, NULL); bpf_program__set_autoload(prog, true); process_prog(filename, obj, prog); - bpf_object__close(obj); goto cleanup; } From e5eb08d8fe469c0da8643042893a0b7481807443 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 21 Sep 2022 09:42:52 -0700 Subject: [PATCH 080/143] selftests/bpf: add CSV output mode for veristat Teach veristat to output results as a CSV table for easier programmatic processing. Change what was the --output/-o argument to now be --emit/-e, and use --output-format/-o to specify the output format. Currently "table" and "csv" are supported, with table being the default. For CSV output mode veristat uses spec identifiers as column names. E.g., instead of "Total states" veristat uses "total_states" as a CSV header name. Internally veristat recognizes three formats, one of them (RESFMT_TABLE_CALCLEN) is a special format instructing veristat to calculate column widths for table output. This felt a bit cleaner and more uniform than creating separate functions just for this.
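The per-cell dispatch then becomes a single switch over the format (a condensed sketch; the full output_stats() changes are in the diff below):

  switch (fmt) {
  case RESFMT_TABLE_CALCLEN:
  	/* dry run: only record the widest value seen per column */
  	len = snprintf(NULL, 0, "%ld", val);
  	if (len > *max_len)
  		*max_len = len;
  	break;
  case RESFMT_TABLE:
  	printf("%s%*ld", i == 0 ? "" : COLUMN_SEP, *max_len, val);
  	break;
  case RESFMT_CSV:
  	printf("%s%ld", i == 0 ? "" : ",", val);
  	break;
  }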
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220921164254.3630690-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 115 +++++++++++++++++-------- 1 file changed, 77 insertions(+), 38 deletions(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index c0c8a65cda52..0472bfae3c9d 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -46,10 +46,17 @@ struct stat_specs { int lens[ALL_STATS_CNT]; }; +enum resfmt { + RESFMT_TABLE, + RESFMT_TABLE_CALCLEN, /* fake format to pre-calculate table's column widths */ + RESFMT_CSV, +}; + static struct env { char **filenames; int filename_cnt; bool verbose; + enum resfmt out_fmt; struct verif_stats *prog_stats; int prog_stat_cnt; @@ -78,8 +85,9 @@ const char argp_program_doc[] = static const struct argp_option opts[] = { { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, { "verbose", 'v', NULL, 0, "Verbose mode" }, - { "output", 'o', "SPEC", 0, "Specify output stats" }, + { "emit", 'e', "SPEC", 0, "Specify stats to be emitted" }, { "sort", 's', "SPEC", 0, "Specify sort order" }, + { "output-format", 'o', "FMT", 0, "Result output format (table, csv), default is table." }, {}, }; @@ -97,7 +105,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'v': env.verbose = true; break; - case 'o': + case 'e': err = parse_stats(arg, &env.output_spec); if (err) return err; @@ -107,6 +115,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) if (err) return err; break; + case 'o': + if (strcmp(arg, "table") == 0) { + env.out_fmt = RESFMT_TABLE; + } else if (strcmp(arg, "csv") == 0) { + env.out_fmt = RESFMT_CSV; + } else { + fprintf(stderr, "Unrecognized output format '%s'\n", arg); + return -EINVAL; + } + break; case ARGP_KEY_ARG: tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames)); if (!tmp) @@ -147,7 +165,7 @@ static struct stat_def { [FILE_NAME] = { "File", {"file_name", "filename", "file"}, true /* asc */ }, [PROG_NAME] = { "Program", {"prog_name", "progname", "prog"}, true /* asc */ }, [VERDICT] = { "Verdict", {"verdict"}, true /* asc: failure, success */ }, - [DURATION] = { "Duration, us", {"duration", "dur"}, }, + [DURATION] = { "Duration (us)", {"duration", "dur"}, }, [TOTAL_INSNS] = { "Total insns", {"total_insns", "insns"}, }, [TOTAL_STATES] = { "Total states", {"total_states", "states"}, }, [PEAK_STATES] = { "Peak states", {"peak_states"}, }, @@ -385,27 +403,6 @@ static int cmp_prog_stats(const void *v1, const void *v2) #define HEADER_CHAR '-' #define COLUMN_SEP " " -static void output_headers(bool calc_len) -{ - int i, len; - - for (i = 0; i < env.output_spec.spec_cnt; i++) { - int id = env.output_spec.ids[i]; - int *max_len = &env.output_spec.lens[i]; - - if (calc_len) { - len = snprintf(NULL, 0, "%s", stat_defs[id].header); - if (len > *max_len) - *max_len = len; - } else { - printf("%s%-*s", i == 0 ? 
"" : COLUMN_SEP, *max_len, stat_defs[id].header); - } - } - - if (!calc_len) - printf("\n"); -} - static void output_header_underlines(void) { int i, j, len; @@ -420,7 +417,38 @@ static void output_header_underlines(void) printf("\n"); } -static void output_stats(const struct verif_stats *s, bool calc_len) +static void output_headers(enum resfmt fmt) +{ + int i, len; + + for (i = 0; i < env.output_spec.spec_cnt; i++) { + int id = env.output_spec.ids[i]; + int *max_len = &env.output_spec.lens[i]; + + switch (fmt) { + case RESFMT_TABLE_CALCLEN: + len = snprintf(NULL, 0, "%s", stat_defs[id].header); + if (len > *max_len) + *max_len = len; + break; + case RESFMT_TABLE: + printf("%s%-*s", i == 0 ? "" : COLUMN_SEP, *max_len, stat_defs[id].header); + if (i == env.output_spec.spec_cnt - 1) + printf("\n"); + break; + case RESFMT_CSV: + printf("%s%s", i == 0 ? "" : ",", stat_defs[id].names[0]); + if (i == env.output_spec.spec_cnt - 1) + printf("\n"); + break; + } + } + + if (fmt == RESFMT_TABLE) + output_header_underlines(); +} + +static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last) { int i; @@ -453,23 +481,36 @@ static void output_stats(const struct verif_stats *s, bool calc_len) exit(1); } - if (calc_len) { + switch (fmt) { + case RESFMT_TABLE_CALCLEN: if (str) len = snprintf(NULL, 0, "%s", str); else len = snprintf(NULL, 0, "%ld", val); if (len > *max_len) *max_len = len; - } else { + break; + case RESFMT_TABLE: if (str) printf("%s%-*s", i == 0 ? "" : COLUMN_SEP, *max_len, str); else printf("%s%*ld", i == 0 ? "" : COLUMN_SEP, *max_len, val); + if (i == env.output_spec.spec_cnt - 1) + printf("\n"); + break; + case RESFMT_CSV: + if (str) + printf("%s%s", i == 0 ? "" : ",", str); + else + printf("%s%ld", i == 0 ? "" : ",", val); + if (i == env.output_spec.spec_cnt - 1) + printf("\n"); + break; } } - if (!calc_len) - printf("\n"); + if (last && fmt == RESFMT_TABLE) + output_header_underlines(); } int main(int argc, char **argv) @@ -505,20 +546,18 @@ int main(int argc, char **argv) qsort(env.prog_stats, env.prog_stat_cnt, sizeof(*env.prog_stats), cmp_prog_stats); - /* calculate column widths */ - output_headers(true); - for (i = 0; i < env.prog_stat_cnt; i++) { - output_stats(&env.prog_stats[i], true); + if (env.out_fmt == RESFMT_TABLE) { + /* calculate column widths */ + output_headers(RESFMT_TABLE_CALCLEN); + for (i = 0; i < env.prog_stat_cnt; i++) + output_stats(&env.prog_stats[i], RESFMT_TABLE_CALCLEN, false); } /* actually output the table */ - output_headers(false); - output_header_underlines(); + output_headers(env.out_fmt); for (i = 0; i < env.prog_stat_cnt; i++) { - output_stats(&env.prog_stats[i], false); + output_stats(&env.prog_stats[i], env.out_fmt, i == env.prog_stat_cnt - 1); } - output_header_underlines(); - printf("\n"); printf("Done. Processed %d object files, %d programs.\n", env.filename_cnt, env.prog_stat_cnt); From 394169b079b558cf91a9c23ffb6b55c14cd927e1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 21 Sep 2022 09:42:53 -0700 Subject: [PATCH 081/143] selftests/bpf: add comparison mode to veristat Add ability to compare and contrast two veristat runs, previously recorded with veristat using CSV output format. When veristat is called with -C (--compare) flag, veristat expects exactly two input files specified, both should be in CSV format. Expectation is that it's output from previous veristat runs, but as long as column names and formats match, it should just work. 
The first CSV file provided is designated as the "baseline" and the second one as the comparison (experiment) data set. Establishing the baseline matters later when calculating difference percentages, see below. Veristat parses these two CSV files and "reconstructs" verifier stats (it could be just a subset of all possible stats). File and program names are mandatory as they are used as the joining key (these two "stats" are designated as "key stats" in the code). Veristat currently enforces that the set of stats recorded in both CSVs has to match exactly, down to exact order. This is just a simplifying condition which can be lifted with a bit of additional pre-processing to reorder stat specs internally, which I didn't bother doing, yet. For all the non-key stats, veristat will output three columns: one for baseline data, one for comparison data, and one with an absolute and relative percentage difference. If either baseline or comparison values are missing (that is, the respective CSV file doesn't have a row with *exactly* matching file and program name), those values are assumed to be empty or zero. In such cases relative percentages are forced to +100% or -100% output, for consistency with a typical case. Veristat's -e (--emit) and -s (--sort) specs still apply, so even if a CSV contains lots of stats, the user can request to compare only a subset of them (and specify the desired column order as well). Similarly, both CSV and human-readable table outputs are honored. Note that input is currently always expected to be CSV. Here's an example shell session, recording data for the biosnoop tool on two different kernels and comparing them afterwards, outputting data in table format. # on slightly older production kernel $ sudo ./veristat biosnoop_bpf.o File Program Verdict Duration (us) Total insns Total states Peak states -------------- ------------------------ ------- ------------- ----------- ------------ ----------- biosnoop_bpf.o blk_account_io_merge_bio success 37 24 1 1 biosnoop_bpf.o blk_account_io_start failure 0 0 0 0 biosnoop_bpf.o block_rq_complete success 76 104 6 6 biosnoop_bpf.o block_rq_insert success 83 85 7 7 biosnoop_bpf.o block_rq_issue success 79 85 7 7 -------------- ------------------------ ------- ------------- ----------- ------------ ----------- Done. Processed 1 object files, 5 programs. $ sudo ./veristat ~/local/tmp/fbcode-bpf-objs/biosnoop_bpf.o -o csv > baseline.csv $ cat baseline.csv file_name,prog_name,verdict,duration,total_insns,total_states,peak_states biosnoop_bpf.o,blk_account_io_merge_bio,success,36,24,1,1 biosnoop_bpf.o,blk_account_io_start,failure,0,0,0,0 biosnoop_bpf.o,block_rq_complete,success,82,104,6,6 biosnoop_bpf.o,block_rq_insert,success,78,85,7,7 biosnoop_bpf.o,block_rq_issue,success,74,85,7,7 # on latest bpf-next kernel $ sudo ./veristat biosnoop_bpf.o File Program Verdict Duration (us) Total insns Total states Peak states -------------- ------------------------ ------- ------------- ----------- ------------ ----------- biosnoop_bpf.o blk_account_io_merge_bio success 31 24 1 1 biosnoop_bpf.o blk_account_io_start failure 0 0 0 0 biosnoop_bpf.o block_rq_complete success 76 104 6 6 biosnoop_bpf.o block_rq_insert success 83 91 7 7 biosnoop_bpf.o block_rq_issue success 74 91 7 7 -------------- ------------------------ ------- ------------- ----------- ------------ ----------- Done. Processed 1 object files, 5 programs. 
$ sudo ./veristat biosnoop_bpf.o -o csv > comparison.csv $ cat comparison.csv file_name,prog_name,verdict,duration,total_insns,total_states,peak_states biosnoop_bpf.o,blk_account_io_merge_bio,success,71,24,1,1 biosnoop_bpf.o,blk_account_io_start,failure,0,0,0,0 biosnoop_bpf.o,block_rq_complete,success,82,104,6,6 biosnoop_bpf.o,block_rq_insert,success,83,91,7,7 biosnoop_bpf.o,block_rq_issue,success,87,91,7,7 # now let's compare with human-readable output (note that no sudo is needed) # we also ignore verification duration in this case to shorten the output $ ./veristat -C baseline.csv comparison.csv -e file,prog,verdict,insns File Program Verdict (A) Verdict (B) Verdict (DIFF) Total insns (A) Total insns (B) Total insns (DIFF) -------------- ------------------------ ----------- ----------- -------------- --------------- --------------- ------------------ biosnoop_bpf.o blk_account_io_merge_bio success success MATCH 24 24 +0 (+0.00%) biosnoop_bpf.o blk_account_io_start failure failure MATCH 0 0 +0 (+100.00%) biosnoop_bpf.o block_rq_complete success success MATCH 104 104 +0 (+0.00%) biosnoop_bpf.o block_rq_insert success success MATCH 91 85 -6 (-6.59%) biosnoop_bpf.o block_rq_issue success success MATCH 91 85 -6 (-6.59%) -------------- ------------------------ ----------- ----------- -------------- --------------- --------------- ------------------ While not a particularly exciting example (it turned out to be kind of hard to quickly find a nice example with a significant difference just because of a kernel version bump), it should demonstrate the main features. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220921164254.3630690-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 543 ++++++++++++++++++++++--- 1 file changed, 492 insertions(+), 51 deletions(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 0472bfae3c9d..c6837bac357f 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -43,7 +43,7 @@ struct stat_specs { int spec_cnt; enum stat_id ids[ALL_STATS_CNT]; bool asc[ALL_STATS_CNT]; - int lens[ALL_STATS_CNT]; + int lens[ALL_STATS_CNT * 3]; /* 3x for comparison mode */ }; enum resfmt { @@ -57,16 +57,20 @@ static struct env { int filename_cnt; bool verbose; enum resfmt out_fmt; + bool comparison_mode; struct verif_stats *prog_stats; int prog_stat_cnt; + /* baseline_stats is allocated and used only in comparison mode */ + struct verif_stats *baseline_stats; + int baseline_stat_cnt; + struct stat_specs output_spec; struct stat_specs sort_spec; } env; -static int libbpf_print_fn(enum libbpf_print_level level, - const char *format, va_list args) +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) { if (!env.verbose) return 0; @@ -78,9 +82,10 @@ static int libbpf_print_fn(enum libbpf_print_level level, const char *argp_program_version = "veristat"; const char *argp_program_bug_address = "<bpf@vger.kernel.org>"; const char argp_program_doc[] = -"veristat BPF verifier stats collection tool.\n" +"veristat BPF verifier stats collection and comparison tool.\n" "\n" -"USAGE: veristat <obj-file> [<obj-file>...]\n"; +"USAGE: veristat <obj-file> [<obj-file>...]\n" +" OR: veristat -C <baseline.csv> <comparison.csv>\n"; static const struct argp_option opts[] = { { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, @@ -88,6 +93,7 @@ static const struct argp_option opts[] = { { "emit", 'e', "SPEC", 0, "Specify stats to be emitted" }, { "sort", 's', "SPEC", 0, "Specify sort order" }, { "output-format", 'o', 
"FMT", 0, "Result output format (table, csv), default is table." }, + { "compare", 'C', NULL, 0, "Comparison mode" }, {}, }; @@ -125,6 +131,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return -EINVAL; } break; + case 'C': + env.comparison_mode = true; + break; case ARGP_KEY_ARG: tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames)); if (!tmp) @@ -141,6 +150,12 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return 0; } +static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, +}; + static const struct stat_specs default_output_spec = { .spec_cnt = 7, .ids = { @@ -219,6 +234,20 @@ static int parse_stats(const char *stats_str, struct stat_specs *specs) return 0; } +static void free_verif_stats(struct verif_stats *stats, size_t stat_cnt) +{ + int i; + + if (!stats) + return; + + for (i = 0; i < stat_cnt; i++) { + free(stats[i].file_name); + free(stats[i].prog_name); + } + free(stats); +} + static char verif_log_buf[64 * 1024]; static int parse_verif_log(const char *buf, size_t buf_sz, struct verif_stats *s) @@ -448,6 +477,33 @@ static void output_headers(enum resfmt fmt) output_header_underlines(); } +static void prepare_value(const struct verif_stats *s, enum stat_id id, + const char **str, long *val) +{ + switch (id) { + case FILE_NAME: + *str = s->file_name; + break; + case PROG_NAME: + *str = s->prog_name; + break; + case VERDICT: + *str = s->stats[VERDICT] ? "success" : "failure"; + break; + case DURATION: + case TOTAL_INSNS: + case TOTAL_STATES: + case PEAK_STATES: + case MAX_STATES_PER_INSN: + case MARK_READ_MAX_LEN: + *val = s->stats[id]; + break; + default: + fprintf(stderr, "Unrecognized stat #%d\n", id); + exit(1); + } +} + static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last) { int i; @@ -458,28 +514,7 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last const char *str = NULL; long val = 0; - switch (id) { - case FILE_NAME: - str = s->file_name; - break; - case PROG_NAME: - str = s->prog_name; - break; - case VERDICT: - str = s->stats[VERDICT] ? "success" : "failure"; - break; - case DURATION: - case TOTAL_INSNS: - case TOTAL_STATES: - case PEAK_STATES: - case MAX_STATES_PER_INSN: - case MARK_READ_MAX_LEN: - val = s->stats[id]; - break; - default: - fprintf(stderr, "Unrecognized stat #%d\n", id); - exit(1); - } + prepare_value(s, id, &str, &val); switch (fmt) { case RESFMT_TABLE_CALCLEN: @@ -509,38 +544,28 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last } } - if (last && fmt == RESFMT_TABLE) + if (last && fmt == RESFMT_TABLE) { output_header_underlines(); + printf("Done. 
Processed %d object files, %d programs.\n", + env.filename_cnt, env.prog_stat_cnt); + } } -int main(int argc, char **argv) +static int handle_verif_mode(void) { - static const struct argp argp = { - .options = opts, - .parser = parse_arg, - .doc = argp_program_doc, - }; - int err = 0, i; - - if (argp_parse(&argp, argc, argv, 0, NULL, NULL)) - return 1; + int i, err; if (env.filename_cnt == 0) { fprintf(stderr, "Please provide path to BPF object file!\n"); argp_help(&argp, stderr, ARGP_HELP_USAGE, "veristat"); - return 1; + return -EINVAL; } - if (env.output_spec.spec_cnt == 0) - env.output_spec = default_output_spec; - if (env.sort_spec.spec_cnt == 0) - env.sort_spec = default_sort_spec; - for (i = 0; i < env.filename_cnt; i++) { err = process_obj(env.filenames[i]); if (err) { fprintf(stderr, "Failed to process '%s': %d\n", env.filenames[i], err); - goto cleanup; + return err; } } @@ -559,15 +584,431 @@ int main(int argc, char **argv) output_stats(&env.prog_stats[i], env.out_fmt, i == env.prog_stat_cnt - 1); } - printf("Done. Processed %d object files, %d programs.\n", - env.filename_cnt, env.prog_stat_cnt); + return 0; +} + +static int parse_stat_value(const char *str, enum stat_id id, struct verif_stats *st) +{ + switch (id) { + case FILE_NAME: + st->file_name = strdup(str); + if (!st->file_name) + return -ENOMEM; + break; + case PROG_NAME: + st->prog_name = strdup(str); + if (!st->prog_name) + return -ENOMEM; + break; + case VERDICT: + if (strcmp(str, "success") == 0) { + st->stats[VERDICT] = true; + } else if (strcmp(str, "failure") == 0) { + st->stats[VERDICT] = false; + } else { + fprintf(stderr, "Unrecognized verification verdict '%s'\n", str); + return -EINVAL; + } + break; + case DURATION: + case TOTAL_INSNS: + case TOTAL_STATES: + case PEAK_STATES: + case MAX_STATES_PER_INSN: + case MARK_READ_MAX_LEN: { + long val; + int err, n; + + if (sscanf(str, "%ld %n", &val, &n) != 1 || n != strlen(str)) { + err = -errno; + fprintf(stderr, "Failed to parse '%s' as integer\n", str); + return err; + } + + st->stats[id] = val; + break; + } + default: + fprintf(stderr, "Unrecognized stat #%d\n", id); + return -EINVAL; + } + return 0; +} + +static int parse_stats_csv(const char *filename, struct stat_specs *specs, + struct verif_stats **statsp, int *stat_cntp) +{ + char line[4096]; + FILE *f; + int err = 0; + bool header = true; + + f = fopen(filename, "r"); + if (!f) { + err = -errno; + fprintf(stderr, "Failed to open '%s': %d\n", filename, err); + return err; + } + + *stat_cntp = 0; + + while (fgets(line, sizeof(line), f)) { + char *input = line, *state = NULL, *next; + struct verif_stats *st = NULL; + int col = 0; + + if (!header) { + void *tmp; + + tmp = realloc(*statsp, (*stat_cntp + 1) * sizeof(**statsp)); + if (!tmp) { + err = -ENOMEM; + goto cleanup; + } + *statsp = tmp; + st = &(*statsp)[*stat_cntp]; + *stat_cntp += 1; + } + + while ((next = strtok_r(state ? 
NULL : input, ",\n", &state))) { + if (header) { + /* for the first line, set up spec stats */ + err = parse_stat(next, specs); + if (err) + goto cleanup; + continue; + } + + /* for all other lines, parse values based on spec */ + if (col >= specs->spec_cnt) { + fprintf(stderr, "Found extraneous column #%d in row #%d of '%s'\n", + col, *stat_cntp, filename); + err = -EINVAL; + goto cleanup; + } + err = parse_stat_value(next, specs->ids[col], st); + if (err) + goto cleanup; + col++; + } + + if (!header && col < specs->spec_cnt) { + fprintf(stderr, "Not enough columns in row #%d in '%s'\n", + *stat_cntp, filename); + err = -EINVAL; + goto cleanup; + } + + header = false; + } + + if (!feof(f)) { + err = -errno; + fprintf(stderr, "Failed I/O for '%s': %d\n", filename, err); + } cleanup: - for (i = 0; i < env.prog_stat_cnt; i++) { - free(env.prog_stats[i].file_name); - free(env.prog_stats[i].prog_name); + fclose(f); + return err; +} + +/* empty/zero stats for mismatched rows */ +static const struct verif_stats fallback_stats = { .file_name = "", .prog_name = "" }; + +static bool is_key_stat(enum stat_id id) +{ + return id == FILE_NAME || id == PROG_NAME; +} + +static void output_comp_header_underlines(void) +{ + int i, j, k; + + for (i = 0; i < env.output_spec.spec_cnt; i++) { + int id = env.output_spec.ids[i]; + int max_j = is_key_stat(id) ? 1 : 3; + + for (j = 0; j < max_j; j++) { + int len = env.output_spec.lens[3 * i + j]; + + printf("%s", i + j == 0 ? "" : COLUMN_SEP); + + for (k = 0; k < len; k++) + printf("%c", HEADER_CHAR); + } } - free(env.prog_stats); + printf("\n"); +} + +static void output_comp_headers(enum resfmt fmt) +{ + static const char *table_sfxs[3] = {" (A)", " (B)", " (DIFF)"}; + static const char *name_sfxs[3] = {"_base", "_comp", "_diff"}; + int i, j, len; + + for (i = 0; i < env.output_spec.spec_cnt; i++) { + int id = env.output_spec.ids[i]; + /* key stats don't have A/B/DIFF columns, they are common for both data sets */ + int max_j = is_key_stat(id) ? 1 : 3; + + for (j = 0; j < max_j; j++) { + int *max_len = &env.output_spec.lens[3 * i + j]; + bool last = (i == env.output_spec.spec_cnt - 1) && (j == max_j - 1); + const char *sfx; + + switch (fmt) { + case RESFMT_TABLE_CALCLEN: + sfx = is_key_stat(id) ? "" : table_sfxs[j]; + len = snprintf(NULL, 0, "%s%s", stat_defs[id].header, sfx); + if (len > *max_len) + *max_len = len; + break; + case RESFMT_TABLE: + sfx = is_key_stat(id) ? "" : table_sfxs[j]; + printf("%s%-*s%s", i + j == 0 ? "" : COLUMN_SEP, + *max_len - (int)strlen(sfx), stat_defs[id].header, sfx); + if (last) + printf("\n"); + break; + case RESFMT_CSV: + sfx = is_key_stat(id) ? "" : name_sfxs[j]; + printf("%s%s%s", i + j == 0 ? 
"" : ",", stat_defs[id].names[0], sfx); + if (last) + printf("\n"); + break; + } + } + } + + if (fmt == RESFMT_TABLE) + output_comp_header_underlines(); +} + +static void output_comp_stats(const struct verif_stats *base, const struct verif_stats *comp, + enum resfmt fmt, bool last) +{ + char base_buf[1024] = {}, comp_buf[1024] = {}, diff_buf[1024] = {}; + int i; + + for (i = 0; i < env.output_spec.spec_cnt; i++) { + int id = env.output_spec.ids[i], len; + int *max_len_base = &env.output_spec.lens[3 * i + 0]; + int *max_len_comp = &env.output_spec.lens[3 * i + 1]; + int *max_len_diff = &env.output_spec.lens[3 * i + 2]; + const char *base_str = NULL, *comp_str = NULL; + long base_val = 0, comp_val = 0, diff_val = 0; + + prepare_value(base, id, &base_str, &base_val); + prepare_value(comp, id, &comp_str, &comp_val); + + /* normalize all the outputs to be in string buffers for simplicity */ + if (is_key_stat(id)) { + /* key stats (file and program name) are always strings */ + if (base != &fallback_stats) + snprintf(base_buf, sizeof(base_buf), "%s", base_str); + else + snprintf(base_buf, sizeof(base_buf), "%s", comp_str); + } else if (base_str) { + snprintf(base_buf, sizeof(base_buf), "%s", base_str); + snprintf(comp_buf, sizeof(comp_buf), "%s", comp_str); + if (strcmp(base_str, comp_str) == 0) + snprintf(diff_buf, sizeof(diff_buf), "%s", "MATCH"); + else + snprintf(diff_buf, sizeof(diff_buf), "%s", "MISMATCH"); + } else { + snprintf(base_buf, sizeof(base_buf), "%ld", base_val); + snprintf(comp_buf, sizeof(comp_buf), "%ld", comp_val); + + diff_val = comp_val - base_val; + if (base == &fallback_stats || comp == &fallback_stats || base_val == 0) { + snprintf(diff_buf, sizeof(diff_buf), "%+ld (%+.2lf%%)", + diff_val, comp_val < base_val ? -100.0 : 100.0); + } else { + snprintf(diff_buf, sizeof(diff_buf), "%+ld (%+.2lf%%)", + diff_val, diff_val * 100.0 / base_val); + } + } + + switch (fmt) { + case RESFMT_TABLE_CALCLEN: + len = strlen(base_buf); + if (len > *max_len_base) + *max_len_base = len; + if (!is_key_stat(id)) { + len = strlen(comp_buf); + if (len > *max_len_comp) + *max_len_comp = len; + len = strlen(diff_buf); + if (len > *max_len_diff) + *max_len_diff = len; + } + break; + case RESFMT_TABLE: { + /* string outputs are left-aligned, number outputs are right-aligned */ + const char *fmt = base_str ? "%s%-*s" : "%s%*s"; + + printf(fmt, i == 0 ? "" : COLUMN_SEP, *max_len_base, base_buf); + if (!is_key_stat(id)) { + printf(fmt, COLUMN_SEP, *max_len_comp, comp_buf); + printf(fmt, COLUMN_SEP, *max_len_diff, diff_buf); + } + if (i == env.output_spec.spec_cnt - 1) + printf("\n"); + break; + } + case RESFMT_CSV: + printf("%s%s", i == 0 ? "" : ",", base_buf); + if (!is_key_stat(id)) { + printf("%s%s", i == 0 ? "" : ",", comp_buf); + printf("%s%s", i == 0 ? 
"" : ",", diff_buf); + } + if (i == env.output_spec.spec_cnt - 1) + printf("\n"); + break; + } + } + + if (last && fmt == RESFMT_TABLE) + output_comp_header_underlines(); +} + +static int cmp_stats_key(const struct verif_stats *base, const struct verif_stats *comp) +{ + int r; + + r = strcmp(base->file_name, comp->file_name); + if (r != 0) + return r; + return strcmp(base->prog_name, comp->prog_name); +} + +static int handle_comparison_mode(void) +{ + struct stat_specs base_specs = {}, comp_specs = {}; + enum resfmt cur_fmt; + int err, i, j; + + if (env.filename_cnt != 2) { + fprintf(stderr, "Comparison mode expects exactly two input CSV files!\n"); + argp_help(&argp, stderr, ARGP_HELP_USAGE, "veristat"); + return -EINVAL; + } + + err = parse_stats_csv(env.filenames[0], &base_specs, + &env.baseline_stats, &env.baseline_stat_cnt); + if (err) { + fprintf(stderr, "Failed to parse stats from '%s': %d\n", env.filenames[0], err); + return err; + } + err = parse_stats_csv(env.filenames[1], &comp_specs, + &env.prog_stats, &env.prog_stat_cnt); + if (err) { + fprintf(stderr, "Failed to parse stats from '%s': %d\n", env.filenames[1], err); + return err; + } + + /* To keep it simple we validate that the set and order of stats in + * both CSVs are exactly the same. This can be lifted with a bit more + * pre-processing later. + */ + if (base_specs.spec_cnt != comp_specs.spec_cnt) { + fprintf(stderr, "Number of stats in '%s' and '%s' differs (%d != %d)!\n", + env.filenames[0], env.filenames[1], + base_specs.spec_cnt, comp_specs.spec_cnt); + return -EINVAL; + } + for (i = 0; i < base_specs.spec_cnt; i++) { + if (base_specs.ids[i] != comp_specs.ids[i]) { + fprintf(stderr, "Stats composition differs between '%s' and '%s' (%s != %s)!\n", + env.filenames[0], env.filenames[1], + stat_defs[base_specs.ids[i]].names[0], + stat_defs[comp_specs.ids[i]].names[0]); + return -EINVAL; + } + } + + qsort(env.prog_stats, env.prog_stat_cnt, sizeof(*env.prog_stats), cmp_prog_stats); + qsort(env.baseline_stats, env.baseline_stat_cnt, sizeof(*env.baseline_stats), cmp_prog_stats); + + /* for human-readable table output we need to do extra pass to + * calculate column widths, so we substitute current output format + * with RESFMT_TABLE_CALCLEN and later revert it back to RESFMT_TABLE + * and do everything again. + */ + if (env.out_fmt == RESFMT_TABLE) + cur_fmt = RESFMT_TABLE_CALCLEN; + else + cur_fmt = env.out_fmt; + +one_more_time: + output_comp_headers(cur_fmt); + + /* If baseline and comparison datasets have different subset of rows + * (we match by 'object + prog' as a unique key) then assume + * empty/missing/zero value for rows that are missing in the opposite + * data set + */ + i = j = 0; + while (i < env.baseline_stat_cnt || j < env.prog_stat_cnt) { + bool last = (i == env.baseline_stat_cnt - 1) || (j == env.prog_stat_cnt - 1); + const struct verif_stats *base, *comp; + int r; + + base = i < env.baseline_stat_cnt ? &env.baseline_stats[i] : &fallback_stats; + comp = j < env.prog_stat_cnt ? 
&env.prog_stats[j] : &fallback_stats; + if (!base->file_name || !base->prog_name) { + fprintf(stderr, "Entry #%d in '%s' doesn't have file and/or program name specified!\n", + i, env.filenames[0]); + return -EINVAL; + } + if (!comp->file_name || !comp->prog_name) { + fprintf(stderr, "Entry #%d in '%s' doesn't have file and/or program name specified!\n", + j, env.filenames[1]); + return -EINVAL; + } + + r = cmp_stats_key(base, comp); + if (r == 0) { + output_comp_stats(base, comp, cur_fmt, last); + i++; + j++; + } else if (comp == &fallback_stats || r < 0) { + output_comp_stats(base, &fallback_stats, cur_fmt, last); + i++; + } else { + output_comp_stats(&fallback_stats, comp, cur_fmt, last); + j++; + } + } + + if (cur_fmt == RESFMT_TABLE_CALCLEN) { + cur_fmt = RESFMT_TABLE; + goto one_more_time; /* ... this time with feeling */ + } + + return 0; +} + +int main(int argc, char **argv) +{ + int err = 0, i; + + if (argp_parse(&argp, argc, argv, 0, NULL, NULL)) + return 1; + + if (env.output_spec.spec_cnt == 0) + env.output_spec = default_output_spec; + if (env.sort_spec.spec_cnt == 0) + env.sort_spec = default_sort_spec; + + if (env.comparison_mode) + err = handle_comparison_mode(); + else + err = handle_verif_mode(); + + free_verif_stats(env.prog_stats, env.prog_stat_cnt); + free_verif_stats(env.baseline_stats, env.baseline_stat_cnt); for (i = 0; i < env.filename_cnt; i++) free(env.filenames[i]); free(env.filenames); From bde4a96cdcadc1f9c92cc2715a0022545bfb3201 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 21 Sep 2022 09:42:54 -0700 Subject: [PATCH 082/143] selftests/bpf: add ability to filter programs in veristat Add -f (--filter) argument which accepts glob-based filters for narrowing down what BPF object files and programs within them should be processed by veristat. This filtering applies both to comparison and main (verification) mode. A filter can take one of two forms: - file (object) filter: 'strobemeta*'; in this case all the programs within matching files are implicitly allowed (or denied, depending on whether it's a positive or negative rule, see below); - file and prog filter: 'strobemeta*/*unroll*' will further filter programs within matching files to only allow those program names that match the '*unroll*' glob. As mentioned, filters can be positive (allowlisting) and negative (denylisting). Negative filters should start with '!': '!strobemeta*' will deny any filename whose basename starts with "strobemeta". Further, one extra special syntax is supported to allow more convenient use in practice. Instead of specifying rules on the command line, veristat allows specifying a file that contains rules, both positive and negative, one filter per line. This is achieved with -f @<filepath> use, where <filepath> points to a text file containing rules (negative and positive rules can be mixed). For convenience, empty lines and lines starting with '#' are ignored. This feature is useful for keeping a pre-canned list of object files and program names that are tested repeatedly, allowing one to check in a list of rules and quickly specify them on the command line; see the example rules file below. As a demonstration (and a shortcut for the nearest future), create a small list of "interesting" BPF object files from selftests/bpf and commit it as veristat.cfg. It currently includes 73 programs, most of which are the most complex and largest BPF programs in selftests, as judged by total verified instruction count and verifier states total. 
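For illustration, a hypothetical rules file and its use could look like this (the name filters.txt and the specific globs are made up for this example; the syntax follows the description above):

$ cat filters.txt
# verify all pyperf and strobemeta object files...
pyperf*
strobemeta*
# ...but skip the non-unrolled strobemeta variants
!strobemeta_nounroll*

$ ./veristat -f @filters.txt *.bpf.o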
If there is overlap between positive and negative filters, the negative filter takes precedence (denylisting is stronger than allowlisting). If no allow filter is specified, veristat implicitly assumes the '*/*' rule. If no deny rule is specified, veristat (logically) assumes no negative filters. Also note that -f (just like -e and -s) can be specified multiple times and its effect is cumulative. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220921164254.3630690-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 212 ++++++++++++++++++++++- tools/testing/selftests/bpf/veristat.cfg | 17 ++ 2 files changed, 227 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/veristat.cfg diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index c6837bac357f..51030234b60a 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -52,6 +52,11 @@ enum resfmt { RESFMT_CSV, }; +struct filter { + char *file_glob; + char *prog_glob; +}; + static struct env { char **filenames; int filename_cnt; @@ -68,6 +73,11 @@ static struct env { struct stat_specs output_spec; struct stat_specs sort_spec; + + struct filter *allow_filters; + struct filter *deny_filters; + int allow_filter_cnt; + int deny_filter_cnt; } env; static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) @@ -94,10 +104,13 @@ static const struct argp_option opts[] = { { "sort", 's', "SPEC", 0, "Specify sort order" }, { "output-format", 'o', "FMT", 0, "Result output format (table, csv), default is table." }, { "compare", 'C', NULL, 0, "Comparison mode" }, + { "filter", 'f', "FILTER", 0, "Filter expressions (or @filename for file with expressions)." 
}, {}, }; static int parse_stats(const char *stats_str, struct stat_specs *specs); +static int append_filter(struct filter **filters, int *cnt, const char *str); +static int append_filter_file(const char *path); static error_t parse_arg(int key, char *arg, struct argp_state *state) { @@ -134,6 +147,18 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'C': env.comparison_mode = true; break; + case 'f': + if (arg[0] == '@') + err = append_filter_file(arg + 1); + else if (arg[0] == '!') + err = append_filter(&env.deny_filters, &env.deny_filter_cnt, arg + 1); + else + err = append_filter(&env.allow_filters, &env.allow_filter_cnt, arg); + if (err) { + fprintf(stderr, "Failed to collect program filter expressions: %d\n", err); + return err; + } + break; case ARGP_KEY_ARG: tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames)); if (!tmp) @@ -156,6 +181,150 @@ static const struct argp argp = { .doc = argp_program_doc, }; + +/* Adapted from perf/util/string.c */ +static bool glob_matches(const char *str, const char *pat) +{ + while (*str && *pat && *pat != '*') { + if (*str != *pat) + return false; + str++; + pat++; + } + /* Check wild card */ + if (*pat == '*') { + while (*pat == '*') + pat++; + if (!*pat) /* Tail wild card matches all */ + return true; + while (*str) + if (glob_matches(str++, pat)) + return true; + } + return !*str && !*pat; +} + +static bool should_process_file(const char *filename) +{ + int i; + + if (env.deny_filter_cnt > 0) { + for (i = 0; i < env.deny_filter_cnt; i++) { + if (glob_matches(filename, env.deny_filters[i].file_glob)) + return false; + } + } + + if (env.allow_filter_cnt == 0) + return true; + + for (i = 0; i < env.allow_filter_cnt; i++) { + if (glob_matches(filename, env.allow_filters[i].file_glob)) + return true; + } + + return false; +} + +static bool should_process_prog(const char *filename, const char *prog_name) +{ + int i; + + if (env.deny_filter_cnt > 0) { + for (i = 0; i < env.deny_filter_cnt; i++) { + if (glob_matches(filename, env.deny_filters[i].file_glob)) + return false; + if (!env.deny_filters[i].prog_glob) + continue; + if (glob_matches(prog_name, env.deny_filters[i].prog_glob)) + return false; + } + } + + if (env.allow_filter_cnt == 0) + return true; + + for (i = 0; i < env.allow_filter_cnt; i++) { + if (!glob_matches(filename, env.allow_filters[i].file_glob)) + continue; + /* if filter specifies only filename glob part, it implicitly + * allows all progs within that file + */ + if (!env.allow_filters[i].prog_glob) + return true; + if (glob_matches(prog_name, env.allow_filters[i].prog_glob)) + return true; + } + + return false; +} + +static int append_filter(struct filter **filters, int *cnt, const char *str) +{ + struct filter *f; + void *tmp; + const char *p; + + tmp = realloc(*filters, (*cnt + 1) * sizeof(**filters)); + if (!tmp) + return -ENOMEM; + *filters = tmp; + + f = &(*filters)[*cnt]; + f->file_glob = f->prog_glob = NULL; + + /* filter can be specified either as "" or "/" */ + p = strchr(str, '/'); + if (!p) { + f->file_glob = strdup(str); + if (!f->file_glob) + return -ENOMEM; + } else { + f->file_glob = strndup(str, p - str); + f->prog_glob = strdup(p + 1); + if (!f->file_glob || !f->prog_glob) { + free(f->file_glob); + free(f->prog_glob); + f->file_glob = f->prog_glob = NULL; + return -ENOMEM; + } + } + + *cnt = *cnt + 1; + return 0; +} + +static int append_filter_file(const char *path) +{ + char buf[1024]; + FILE *f; + int err = 0; + + f = fopen(path, "r"); + if (!f) { + err = 
-errno; + fprintf(stderr, "Failed to open '%s': %d\n", path, err); + return err; + } + + while (fscanf(f, " %1023[^\n]\n", buf) == 1) { + /* lines starting with # are comments, skip them */ + if (buf[0] == '\0' || buf[0] == '#') + continue; + /* lines starting with ! are negative match filters */ + if (buf[0] == '!') + err = append_filter(&env.deny_filters, &env.deny_filter_cnt, buf + 1); + else + err = append_filter(&env.allow_filters, &env.allow_filter_cnt, buf); + if (err) + goto cleanup; + } + +cleanup: + fclose(f); + return err; +} + static const struct stat_specs default_output_spec = { .spec_cnt = 7, .ids = { @@ -283,6 +452,9 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf int err = 0; void *tmp; + if (!should_process_prog(basename(filename), bpf_program__name(prog))) + return 0; + tmp = realloc(env.prog_stats, (env.prog_stat_cnt + 1) * sizeof(*env.prog_stats)); if (!tmp) return -ENOMEM; @@ -330,6 +502,9 @@ static int process_obj(const char *filename) LIBBPF_OPTS(bpf_object_open_opts, opts); int err = 0, prog_cnt = 0; + if (!should_process_file(basename(filename))) + return 0; + old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn); obj = bpf_object__open_file(filename, &opts); @@ -666,7 +841,10 @@ static int parse_stats_csv(const char *filename, struct stat_specs *specs, goto cleanup; } *statsp = tmp; + st = &(*statsp)[*stat_cntp]; + memset(st, 0, sizeof(*st)); + *stat_cntp += 1; } @@ -692,14 +870,34 @@ static int parse_stats_csv(const char *filename, struct stat_specs *specs, col++; } - if (!header && col < specs->spec_cnt) { + if (header) { + header = false; + continue; + } + + if (col < specs->spec_cnt) { fprintf(stderr, "Not enough columns in row #%d in '%s'\n", *stat_cntp, filename); err = -EINVAL; goto cleanup; } - header = false; + if (!st->file_name || !st->prog_name) { + fprintf(stderr, "Row #%d in '%s' is missing file and/or program name\n", + *stat_cntp, filename); + err = -EINVAL; + goto cleanup; + } + + /* in comparison mode we can only check filters after we + * parsed entire line; if row should be ignored we pretend we + * never parsed it + */ + if (!should_process_prog(st->file_name, st->prog_name)) { + free(st->file_name); + free(st->prog_name); + *stat_cntp -= 1; + } } if (!feof(f)) { @@ -1012,5 +1210,15 @@ int main(int argc, char **argv) for (i = 0; i < env.filename_cnt; i++) free(env.filenames[i]); free(env.filenames); + for (i = 0; i < env.allow_filter_cnt; i++) { + free(env.allow_filters[i].file_glob); + free(env.allow_filters[i].prog_glob); + } + free(env.allow_filters); + for (i = 0; i < env.deny_filter_cnt; i++) { + free(env.deny_filters[i].file_glob); + free(env.deny_filters[i].prog_glob); + } + free(env.deny_filters); return -err; } diff --git a/tools/testing/selftests/bpf/veristat.cfg b/tools/testing/selftests/bpf/veristat.cfg new file mode 100644 index 000000000000..1a385061618d --- /dev/null +++ b/tools/testing/selftests/bpf/veristat.cfg @@ -0,0 +1,17 @@ +# pre-canned list of rather complex selftests/bpf BPF object files to monitor +# BPF verifier's performance on +bpf_flow* +bpf_loop_bench* +loop* +netif_receive_skb* +profiler* +pyperf* +strobemeta* +test_cls_redirect* +test_l4lb +test_sysctl* +test_tcp_hdr_* +test_usdt* +test_verif_scale* +test_xdp_noinline* +xdp_synproxy* From b780d1671cf933caa3f67160f73261f10750f1a9 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Tue, 20 Sep 2022 19:14:09 +0300 Subject: [PATCH 083/143] selftests/bpf: Add liburandom_read.so to TEST_GEN_FILES Added urandom_read shared lib is 
missing from the list of installed files, which leaves the urandom_read test broken after `make install` or `make gen_tar`. Add the library to TEST_GEN_FILES. The names in the list do not contain $(OUTPUT) since it's added by lib.mk code. Fixes: 00a0fa2d7d49 ("selftests/bpf: Add urandom_read shared lib and USDTs") Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920161409.129953-1-ykaliuta@redhat.com --- tools/testing/selftests/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 5898d3828b82..e6cf21fad69f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -86,6 +86,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ xskxceiver xdp_redirect_multi xdp_synproxy veristat TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/sign-file +TEST_GEN_FILES += liburandom_read.so # Emit succinct information message describing current building step # $1 - generic step name (e.g., CC, LINK, etc); From f5eb23b91c41a7ffc7ca7fe14f3c512360f02937 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 21 Sep 2022 15:00:34 +0800 Subject: [PATCH 084/143] selftests/bpf: Destroy the skeleton when CONFIG_PREEMPT is off Destroy the created skeleton when CONFIG_PREEMPT is off, otherwise there will be a resource leak. Fixes: 73b97bc78b32 ("selftests/bpf: Test concurrent updates on bpf_task_storage_busy") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220921070035.2016413-2-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/map_tests/task_storage_map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c index aac08c85240b..7d050364efca 100644 --- a/tools/testing/selftests/bpf/map_tests/task_storage_map.c +++ b/tools/testing/selftests/bpf/map_tests/task_storage_map.c @@ -79,6 +79,7 @@ void test_task_storage_map_stress_lookup(void) /* Only for a fully preemptible kernel */ if (!skel->kconfig->CONFIG_PREEMPT) { printf("%s SKIP (no CONFIG_PREEMPT)\n", __func__); + read_bpf_task_storage_busy__destroy(skel); skips++; return; } From 103d002fb7d548fb1187e350f2b73788558128b9 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 21 Sep 2022 15:00:35 +0800 Subject: [PATCH 085/143] selftests/bpf: Free the allocated resources after test case succeeds Free the created fd or allocated bpf_object after a test case succeeds, otherwise there will be resource leaks. Spotted by using the address sanitizer and checking the content of the /proc/$pid/fd directory. 
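As a rough illustration of the /proc/$pid/fd technique (the commands and binary name below are only an example, not part of the patch), one can run a test binary in the background and watch its fd count; a count that keeps growing across test cases hints at leaked descriptors:

$ ./test_maps &
$ ls /proc/$!/fd | wc -l
$ # ... repeat the count later and compare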
Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220921070035.2016413-3-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau --- .../bpf/map_tests/array_map_batch_ops.c | 2 ++ .../bpf/map_tests/htab_map_batch_ops.c | 2 ++ .../bpf/map_tests/lpm_trie_map_batch_ops.c | 2 ++ tools/testing/selftests/bpf/test_maps.c | 24 ++++++++++++------- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c index 78c76496b14a..b595556315bc 100644 --- a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -137,6 +138,7 @@ static void __test_map_lookup_and_update_batch(bool is_pcpu) free(keys); free(values); free(visited); + close(map_fd); } static void array_map_batch_ops(void) diff --git a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c index f807d53fd8dd..1230ccf90128 100644 --- a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -255,6 +256,7 @@ void __test_map_lookup_and_delete_batch(bool is_pcpu) free(visited); if (!is_pcpu) free(values); + close(map_fd); } void htab_map_batch_ops(void) diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c index 87d07b596e17..b66d56ddb7ef 100644 --- a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -150,4 +151,5 @@ void test_lpm_trie_map_batch_ops(void) free(keys); free(values); free(visited); + close(map_fd); } diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 289ff310e283..b73152822aa2 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -659,13 +659,13 @@ static void test_sockmap(unsigned int tasks, void *data) { struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_msg, *bpf_map_break; int map_fd_msg = 0, map_fd_rx = 0, map_fd_tx = 0, map_fd_break; + struct bpf_object *parse_obj, *verdict_obj, *msg_obj; int ports[] = {50200, 50201, 50202, 50204}; int err, i, fd, udp, sfd[6] = {0xdeadbeef}; u8 buf[20] = {0x0, 0x5, 0x3, 0x2, 0x1, 0x0}; int parse_prog, verdict_prog, msg_prog; struct sockaddr_in addr; int one = 1, s, sc, rc; - struct bpf_object *obj; struct timeval to; __u32 key, value; pid_t pid[tasks]; @@ -761,6 +761,7 @@ static void test_sockmap(unsigned int tasks, void *data) i, udp); goto out_sockmap; } + close(udp); /* Test update without programs */ for (i = 0; i < 6; i++) { @@ -823,27 +824,27 @@ static void test_sockmap(unsigned int tasks, void *data) /* Load SK_SKB program and Attach */ err = bpf_prog_test_load(SOCKMAP_PARSE_PROG, - BPF_PROG_TYPE_SK_SKB, &obj, &parse_prog); + BPF_PROG_TYPE_SK_SKB, &parse_obj, &parse_prog); if (err) { printf("Failed to load SK_SKB parse prog\n"); goto out_sockmap; } err = bpf_prog_test_load(SOCKMAP_TCP_MSG_PROG, - BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog); + BPF_PROG_TYPE_SK_MSG, &msg_obj, &msg_prog); if (err) { printf("Failed to load SK_SKB msg prog\n"); goto out_sockmap; } err = 
bpf_prog_test_load(SOCKMAP_VERDICT_PROG, - BPF_PROG_TYPE_SK_SKB, &obj, &verdict_prog); + BPF_PROG_TYPE_SK_SKB, &verdict_obj, &verdict_prog); if (err) { printf("Failed to load SK_SKB verdict prog\n"); goto out_sockmap; } - bpf_map_rx = bpf_object__find_map_by_name(obj, "sock_map_rx"); + bpf_map_rx = bpf_object__find_map_by_name(verdict_obj, "sock_map_rx"); if (!bpf_map_rx) { printf("Failed to load map rx from verdict prog\n"); goto out_sockmap; @@ -855,7 +856,7 @@ static void test_sockmap(unsigned int tasks, void *data) goto out_sockmap; } - bpf_map_tx = bpf_object__find_map_by_name(obj, "sock_map_tx"); + bpf_map_tx = bpf_object__find_map_by_name(verdict_obj, "sock_map_tx"); if (!bpf_map_tx) { printf("Failed to load map tx from verdict prog\n"); goto out_sockmap; @@ -867,7 +868,7 @@ static void test_sockmap(unsigned int tasks, void *data) goto out_sockmap; } - bpf_map_msg = bpf_object__find_map_by_name(obj, "sock_map_msg"); + bpf_map_msg = bpf_object__find_map_by_name(verdict_obj, "sock_map_msg"); if (!bpf_map_msg) { printf("Failed to load map msg from msg_verdict prog\n"); goto out_sockmap; @@ -879,7 +880,7 @@ static void test_sockmap(unsigned int tasks, void *data) goto out_sockmap; } - bpf_map_break = bpf_object__find_map_by_name(obj, "sock_map_break"); + bpf_map_break = bpf_object__find_map_by_name(verdict_obj, "sock_map_break"); if (!bpf_map_break) { printf("Failed to load map tx from verdict prog\n"); goto out_sockmap; @@ -1125,7 +1126,9 @@ static void test_sockmap(unsigned int tasks, void *data) } close(fd); close(map_fd_rx); - bpf_object__close(obj); + bpf_object__close(parse_obj); + bpf_object__close(msg_obj); + bpf_object__close(verdict_obj); return; out: for (i = 0; i < 6; i++) @@ -1283,8 +1286,11 @@ static void test_map_in_map(void) printf("Inner map mim.inner was not destroyed\n"); goto out_map_in_map; } + + close(fd); } + bpf_object__close(obj); return; out_map_in_map: From e0401dce5e28fb7118dbfd055c77d94433778a85 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Mon, 19 Sep 2022 17:53:30 +0000 Subject: [PATCH 086/143] selftests/bpf: Simplify cgroup_hierarchical_stats selftest The cgroup_hierarchical_stats selftest is complicated. It has to be, because it tests an entire workflow of recording, aggregating, and dumping cgroup stats. However, some of the complexity is unnecessary. The test currently enables the memory controller in a cgroup hierarchy, invokes reclaim, measures reclaim time, then uses that reclaim time to test the stats collection and aggregation. We don't need to use such a complicated stat, as the context in which the stat is collected is orthogonal. Simplify the test by using a simple stat instead of reclaim time: the total number of times a process has ever entered a cgroup. This makes the test simpler and removes the dependency on the memory controller and the memory reclaim interface. 
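As a worked example of the expected aggregation (assuming PROCESSES_PER_CGROUP = 3 and the test/child1/child2 hierarchy implied by the checks in the diff below), each leaf counts its own attachments and every inner node is the sum of its children:

test (12)
  child1 (6)
    child1_1 (3)
    child1_2 (3)
  child2 (6)
    child2_1 (3)
    child2_2 (3)

The root counter is only checked with >=, since attachments outside the test hierarchy are counted there as well.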
Signed-off-by: Yosry Ahmed Signed-off-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20220919175330.890793-1-yosryahmed@google.com --- .../prog_tests/cgroup_hierarchical_stats.c | 168 ++++++++-------- .../bpf/progs/cgroup_hierarchical_stats.c | 179 ++++++------------ 2 files changed, 129 insertions(+), 218 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c index bed1661596f7..3bd27d2ea668 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c @@ -1,6 +1,22 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Functions to manage eBPF programs attached to cgroup subsystems + * This test makes sure BPF stats collection using rstat works correctly. + * The test uses 3 BPF progs: + * (a) counter: This BPF prog is invoked every time we attach a process to a + * cgroup and locklessly increments a percpu counter. + * The program then calls cgroup_rstat_updated() to inform rstat + * of an update on the (cpu, cgroup) pair. + * + * (b) flusher: This BPF prog is invoked when an rstat flush is ongoing, it + * aggregates all percpu counters to a total counter, and also + * propagates the changes to the ancestor cgroups. + * + * (c) dumper: This BPF prog is a cgroup_iter. It is used to output the total + * counter of a cgroup through reading a file in userspace. + * + * The test sets up a cgroup hierarchy, and the above programs. It spawns a few + * processes in the leaf cgroups and makes sure all the counters are aggregated + * correctly. * * Copyright 2022 Google LLC. */ @@ -21,8 +37,10 @@ #define PAGE_SIZE 4096 #define MB(x) (x << 20) +#define PROCESSES_PER_CGROUP 3 + #define BPFFS_ROOT "/sys/fs/bpf/" -#define BPFFS_VMSCAN BPFFS_ROOT"vmscan/" +#define BPFFS_ATTACH_COUNTERS BPFFS_ROOT "attach_counters/" #define CG_ROOT_NAME "root" #define CG_ROOT_ID 1 @@ -79,7 +97,7 @@ static int setup_bpffs(void) return err; /* Create a directory to contain stat files in bpffs */ - err = mkdir(BPFFS_VMSCAN, 0755); + err = mkdir(BPFFS_ATTACH_COUNTERS, 0755); if (!ASSERT_OK(err, "mkdir")) return err; @@ -89,7 +107,7 @@ static int setup_bpffs(void) static void cleanup_bpffs(void) { /* Remove created directory in bpffs */ - ASSERT_OK(rmdir(BPFFS_VMSCAN), "rmdir "BPFFS_VMSCAN); + ASSERT_OK(rmdir(BPFFS_ATTACH_COUNTERS), "rmdir "BPFFS_ATTACH_COUNTERS); /* Unmount bpffs, if it wasn't already mounted when we started */ if (mounted_bpffs) @@ -118,18 +136,6 @@ static int setup_cgroups(void) cgroups[i].fd = fd; cgroups[i].id = get_cgroup_id(cgroups[i].path); - - /* - * Enable memcg controller for the entire hierarchy. - * Note that stats are collected for all cgroups in a hierarchy - * with memcg enabled anyway, but are only exposed for cgroups - * that have memcg enabled. 
- */ - if (i < N_NON_LEAF_CGROUPS) { - err = enable_controllers(cgroups[i].path, "memory"); - if (!ASSERT_OK(err, "enable_controllers")) - return err; - } } return 0; } @@ -154,109 +160,85 @@ static void destroy_hierarchy(void) cleanup_bpffs(); } -static int reclaimer(const char *cgroup_path, size_t size) +static int attach_processes(void) { - static char size_buf[128]; - char *buf, *ptr; - int err; + int i, j, status; - /* Join cgroup in the parent process workdir */ - if (join_parent_cgroup(cgroup_path)) - return EACCES; - - /* Allocate memory */ - buf = malloc(size); - if (!buf) - return ENOMEM; - - /* Write to memory to make sure it's actually allocated */ - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) - *ptr = 1; - - /* Try to reclaim memory */ - snprintf(size_buf, 128, "%lu", size); - err = write_cgroup_file_parent(cgroup_path, "memory.reclaim", size_buf); - - free(buf); - /* memory.reclaim returns EAGAIN if the amount is not fully reclaimed */ - if (err && errno != EAGAIN) - return errno; - - return 0; -} - -static int induce_vmscan(void) -{ - int i, status; - - /* - * In every leaf cgroup, run a child process that allocates some memory - * and attempts to reclaim some of it. - */ + /* In every leaf cgroup, attach 3 processes */ for (i = N_NON_LEAF_CGROUPS; i < N_CGROUPS; i++) { - pid_t pid; + for (j = 0; j < PROCESSES_PER_CGROUP; j++) { + pid_t pid; - /* Create reclaimer child */ - pid = fork(); - if (pid == 0) { - status = reclaimer(cgroups[i].path, MB(5)); - exit(status); + /* Create child and attach to cgroup */ + pid = fork(); + if (pid == 0) { + if (join_parent_cgroup(cgroups[i].path)) + exit(EACCES); + exit(0); + } + + /* Cleanup child */ + waitpid(pid, &status, 0); + if (!ASSERT_TRUE(WIFEXITED(status), "child process exited")) + return 1; + if (!ASSERT_EQ(WEXITSTATUS(status), 0, + "child process exit code")) + return 1; } - - /* Cleanup reclaimer child */ - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFEXITED(status), "reclaimer exited"); - ASSERT_EQ(WEXITSTATUS(status), 0, "reclaim exit code"); } return 0; } static unsigned long long -get_cgroup_vmscan_delay(unsigned long long cgroup_id, const char *file_name) +get_attach_counter(unsigned long long cgroup_id, const char *file_name) { - unsigned long long vmscan = 0, id = 0; + unsigned long long attach_counter = 0, id = 0; static char buf[128], path[128]; /* For every cgroup, read the file generated by cgroup_iter */ - snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name); + snprintf(path, 128, "%s%s", BPFFS_ATTACH_COUNTERS, file_name); if (!ASSERT_OK(read_from_file(path, buf, 128), "read cgroup_iter")) return 0; /* Check the output file formatting */ - ASSERT_EQ(sscanf(buf, "cg_id: %llu, total_vmscan_delay: %llu\n", - &id, &vmscan), 2, "output format"); + ASSERT_EQ(sscanf(buf, "cg_id: %llu, attach_counter: %llu\n", + &id, &attach_counter), 2, "output format"); /* Check that the cgroup_id is displayed correctly */ ASSERT_EQ(id, cgroup_id, "cgroup_id"); - /* Check that the vmscan reading is non-zero */ - ASSERT_GT(vmscan, 0, "vmscan_reading"); - return vmscan; + /* Check that the counter is non-zero */ + ASSERT_GT(attach_counter, 0, "attach counter non-zero"); + return attach_counter; } -static void check_vmscan_stats(void) +static void check_attach_counters(void) { - unsigned long long vmscan_readings[N_CGROUPS], vmscan_root; + unsigned long long attach_counters[N_CGROUPS], root_attach_counter; int i; - for (i = 0; i < N_CGROUPS; i++) { - vmscan_readings[i] = get_cgroup_vmscan_delay(cgroups[i].id, - cgroups[i].name); - } 
+ for (i = 0; i < N_CGROUPS; i++) + attach_counters[i] = get_attach_counter(cgroups[i].id, + cgroups[i].name); /* Read stats for root too */ - vmscan_root = get_cgroup_vmscan_delay(CG_ROOT_ID, CG_ROOT_NAME); + root_attach_counter = get_attach_counter(CG_ROOT_ID, CG_ROOT_NAME); + + /* Check that all leafs cgroups have an attach counter of 3 */ + for (i = N_NON_LEAF_CGROUPS; i < N_CGROUPS; i++) + ASSERT_EQ(attach_counters[i], PROCESSES_PER_CGROUP, + "leaf cgroup attach counter"); /* Check that child1 == child1_1 + child1_2 */ - ASSERT_EQ(vmscan_readings[1], vmscan_readings[3] + vmscan_readings[4], - "child1_vmscan"); + ASSERT_EQ(attach_counters[1], attach_counters[3] + attach_counters[4], + "child1_counter"); /* Check that child2 == child2_1 + child2_2 */ - ASSERT_EQ(vmscan_readings[2], vmscan_readings[5] + vmscan_readings[6], - "child2_vmscan"); + ASSERT_EQ(attach_counters[2], attach_counters[5] + attach_counters[6], + "child2_counter"); /* Check that test == child1 + child2 */ - ASSERT_EQ(vmscan_readings[0], vmscan_readings[1] + vmscan_readings[2], - "test_vmscan"); + ASSERT_EQ(attach_counters[0], attach_counters[1] + attach_counters[2], + "test_counter"); /* Check that root >= test */ - ASSERT_GE(vmscan_root, vmscan_readings[1], "root_vmscan"); + ASSERT_GE(root_attach_counter, attach_counters[1], "root_counter"); } /* Creates iter link and pins in bpffs, returns 0 on success, -errno on failure. @@ -278,12 +260,12 @@ static int setup_cgroup_iter(struct cgroup_hierarchical_stats *obj, linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY; opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); - link = bpf_program__attach_iter(obj->progs.dump_vmscan, &opts); + link = bpf_program__attach_iter(obj->progs.dumper, &opts); if (!ASSERT_OK_PTR(link, "attach_iter")) return -EFAULT; /* Pin the link to a bpffs file */ - snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name); + snprintf(path, 128, "%s%s", BPFFS_ATTACH_COUNTERS, file_name); err = bpf_link__pin(link, path); ASSERT_OK(err, "pin cgroup_iter"); @@ -313,7 +295,7 @@ static int setup_progs(struct cgroup_hierarchical_stats **skel) if (!ASSERT_OK(err, "setup_cgroup_iter")) return err; - bpf_program__set_autoattach((*skel)->progs.dump_vmscan, false); + bpf_program__set_autoattach((*skel)->progs.dumper, false); err = cgroup_hierarchical_stats__attach(*skel); if (!ASSERT_OK(err, "attach")) return err; @@ -328,13 +310,13 @@ static void destroy_progs(struct cgroup_hierarchical_stats *skel) for (i = 0; i < N_CGROUPS; i++) { /* Delete files in bpffs that cgroup_iters are pinned in */ - snprintf(path, 128, "%s%s", BPFFS_VMSCAN, + snprintf(path, 128, "%s%s", BPFFS_ATTACH_COUNTERS, cgroups[i].name); ASSERT_OK(remove(path), "remove cgroup_iter pin"); } /* Delete root file in bpffs */ - snprintf(path, 128, "%s%s", BPFFS_VMSCAN, CG_ROOT_NAME); + snprintf(path, 128, "%s%s", BPFFS_ATTACH_COUNTERS, CG_ROOT_NAME); ASSERT_OK(remove(path), "remove cgroup_iter root pin"); cgroup_hierarchical_stats__destroy(skel); } @@ -347,9 +329,9 @@ void test_cgroup_hierarchical_stats(void) goto hierarchy_cleanup; if (setup_progs(&skel)) goto cleanup; - if (induce_vmscan()) + if (attach_processes()) goto cleanup; - check_vmscan_stats(); + check_attach_counters(); cleanup: destroy_progs(skel); hierarchy_cleanup: diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c index 8ab4253a1592..c74362854948 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c +++ 
b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Functions to manage eBPF programs attached to cgroup subsystems - * * Copyright 2022 Google LLC. */ #include "vmlinux.h" @@ -11,25 +9,14 @@ char _license[] SEC("license") = "GPL"; -/* - * Start times are stored per-task, not per-cgroup, as multiple tasks in one - * cgroup can perform reclaim concurrently. - */ -struct { - __uint(type, BPF_MAP_TYPE_TASK_STORAGE); - __uint(map_flags, BPF_F_NO_PREALLOC); - __type(key, int); - __type(value, __u64); -} vmscan_start_time SEC(".maps"); - -struct vmscan_percpu { +struct percpu_attach_counter { /* Previous percpu state, to figure out if we have new updates */ __u64 prev; /* Current percpu state */ __u64 state; }; -struct vmscan { +struct attach_counter { /* State propagated through children, pending aggregation */ __u64 pending; /* Total state, including all cpus and all children */ @@ -38,147 +25,94 @@ struct vmscan { struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); - __uint(max_entries, 100); + __uint(max_entries, 1024); __type(key, __u64); - __type(value, struct vmscan_percpu); -} pcpu_cgroup_vmscan_elapsed SEC(".maps"); + __type(value, struct percpu_attach_counter); +} percpu_attach_counters SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 100); + __uint(max_entries, 1024); __type(key, __u64); - __type(value, struct vmscan); -} cgroup_vmscan_elapsed SEC(".maps"); + __type(value, struct attach_counter); +} attach_counters SEC(".maps"); extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym; extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym; -static struct cgroup *task_memcg(struct task_struct *task) -{ - int cgrp_id; - -#if __has_builtin(__builtin_preserve_enum_value) - cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id, memory_cgrp_id); -#else - cgrp_id = memory_cgrp_id; -#endif - return task->cgroups->subsys[cgrp_id]->cgroup; -} - static uint64_t cgroup_id(struct cgroup *cgrp) { return cgrp->kn->id; } -static int create_vmscan_percpu_elem(__u64 cg_id, __u64 state) +static int create_percpu_attach_counter(__u64 cg_id, __u64 state) { - struct vmscan_percpu pcpu_init = {.state = state, .prev = 0}; + struct percpu_attach_counter pcpu_init = {.state = state, .prev = 0}; - return bpf_map_update_elem(&pcpu_cgroup_vmscan_elapsed, &cg_id, + return bpf_map_update_elem(&percpu_attach_counters, &cg_id, &pcpu_init, BPF_NOEXIST); } -static int create_vmscan_elem(__u64 cg_id, __u64 state, __u64 pending) +static int create_attach_counter(__u64 cg_id, __u64 state, __u64 pending) { - struct vmscan init = {.state = state, .pending = pending}; + struct attach_counter init = {.state = state, .pending = pending}; - return bpf_map_update_elem(&cgroup_vmscan_elapsed, &cg_id, + return bpf_map_update_elem(&attach_counters, &cg_id, &init, BPF_NOEXIST); } -SEC("tp_btf/mm_vmscan_memcg_reclaim_begin") -int BPF_PROG(vmscan_start, int order, gfp_t gfp_flags) +SEC("fentry/cgroup_attach_task") +int BPF_PROG(counter, struct cgroup *dst_cgrp, struct task_struct *leader, + bool threadgroup) { - struct task_struct *task = bpf_get_current_task_btf(); - __u64 *start_time_ptr; + __u64 cg_id = cgroup_id(dst_cgrp); + struct percpu_attach_counter *pcpu_counter = bpf_map_lookup_elem( + &percpu_attach_counters, + &cg_id); - start_time_ptr = bpf_task_storage_get(&vmscan_start_time, task, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); - if (start_time_ptr) - *start_time_ptr = bpf_ktime_get_ns(); - return 0; -} - 
-SEC("tp_btf/mm_vmscan_memcg_reclaim_end") -int BPF_PROG(vmscan_end, unsigned long nr_reclaimed) -{ - struct vmscan_percpu *pcpu_stat; - struct task_struct *current = bpf_get_current_task_btf(); - struct cgroup *cgrp; - __u64 *start_time_ptr; - __u64 current_elapsed, cg_id; - __u64 end_time = bpf_ktime_get_ns(); - - /* - * cgrp is the first parent cgroup of current that has memcg enabled in - * its subtree_control, or NULL if memcg is disabled in the entire tree. - * In a cgroup hierarchy like this: - * a - * / \ - * b c - * If "a" has memcg enabled, while "b" doesn't, then processes in "b" - * will accumulate their stats directly to "a". This makes sure that no - * stats are lost from processes in leaf cgroups that don't have memcg - * enabled, but only exposes stats for cgroups that have memcg enabled. - */ - cgrp = task_memcg(current); - if (!cgrp) + if (pcpu_counter) + pcpu_counter->state += 1; + else if (create_percpu_attach_counter(cg_id, 1)) return 0; - cg_id = cgroup_id(cgrp); - start_time_ptr = bpf_task_storage_get(&vmscan_start_time, current, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); - if (!start_time_ptr) - return 0; - - current_elapsed = end_time - *start_time_ptr; - pcpu_stat = bpf_map_lookup_elem(&pcpu_cgroup_vmscan_elapsed, - &cg_id); - if (pcpu_stat) - pcpu_stat->state += current_elapsed; - else if (create_vmscan_percpu_elem(cg_id, current_elapsed)) - return 0; - - cgroup_rstat_updated(cgrp, bpf_get_smp_processor_id()); + cgroup_rstat_updated(dst_cgrp, bpf_get_smp_processor_id()); return 0; } SEC("fentry/bpf_rstat_flush") -int BPF_PROG(vmscan_flush, struct cgroup *cgrp, struct cgroup *parent, int cpu) +int BPF_PROG(flusher, struct cgroup *cgrp, struct cgroup *parent, int cpu) { - struct vmscan_percpu *pcpu_stat; - struct vmscan *total_stat, *parent_stat; + struct percpu_attach_counter *pcpu_counter; + struct attach_counter *total_counter, *parent_counter; __u64 cg_id = cgroup_id(cgrp); __u64 parent_cg_id = parent ? 
cgroup_id(parent) : 0; - __u64 *pcpu_vmscan; __u64 state; __u64 delta = 0; /* Add CPU changes on this level since the last flush */ - pcpu_stat = bpf_map_lookup_percpu_elem(&pcpu_cgroup_vmscan_elapsed, - &cg_id, cpu); - if (pcpu_stat) { - state = pcpu_stat->state; - delta += state - pcpu_stat->prev; - pcpu_stat->prev = state; + pcpu_counter = bpf_map_lookup_percpu_elem(&percpu_attach_counters, + &cg_id, cpu); + if (pcpu_counter) { + state = pcpu_counter->state; + delta += state - pcpu_counter->prev; + pcpu_counter->prev = state; } - total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id); - if (!total_stat) { - if (create_vmscan_elem(cg_id, delta, 0)) + total_counter = bpf_map_lookup_elem(&attach_counters, &cg_id); + if (!total_counter) { + if (create_attach_counter(cg_id, delta, 0)) return 0; - goto update_parent; } /* Collect pending stats from subtree */ - if (total_stat->pending) { - delta += total_stat->pending; - total_stat->pending = 0; + if (total_counter->pending) { + delta += total_counter->pending; + total_counter->pending = 0; } /* Propagate changes to this cgroup's total */ - total_stat->state += delta; + total_counter->state += delta; update_parent: /* Skip if there are no changes to propagate, or no parent */ @@ -186,20 +120,20 @@ update_parent: return 0; /* Propagate changes to cgroup's parent */ - parent_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, - &parent_cg_id); - if (parent_stat) - parent_stat->pending += delta; + parent_counter = bpf_map_lookup_elem(&attach_counters, + &parent_cg_id); + if (parent_counter) + parent_counter->pending += delta; else - create_vmscan_elem(parent_cg_id, 0, delta); + create_attach_counter(parent_cg_id, 0, delta); return 0; } SEC("iter.s/cgroup") -int BPF_PROG(dump_vmscan, struct bpf_iter_meta *meta, struct cgroup *cgrp) +int BPF_PROG(dumper, struct bpf_iter_meta *meta, struct cgroup *cgrp) { struct seq_file *seq = meta->seq; - struct vmscan *total_stat; + struct attach_counter *total_counter; __u64 cg_id = cgrp ? cgroup_id(cgrp) : 0; /* Do nothing for the terminal call */ @@ -209,18 +143,13 @@ int BPF_PROG(dump_vmscan, struct bpf_iter_meta *meta, struct cgroup *cgrp) /* Flush the stats to make sure we get the most updated numbers */ cgroup_rstat_flush(cgrp); - total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id); - if (!total_stat) { - BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: 0\n", + total_counter = bpf_map_lookup_elem(&attach_counters, &cg_id); + if (!total_counter) { + BPF_SEQ_PRINTF(seq, "cg_id: %llu, attach_counter: 0\n", cg_id); } else { - BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: %llu\n", - cg_id, total_stat->state); + BPF_SEQ_PRINTF(seq, "cg_id: %llu, attach_counter: %llu\n", + cg_id, total_counter->state); } - - /* - * We only dump stats for one cgroup here, so return 1 to stop - * iteration after the first cgroup. - */ - return 1; + return 0; } From e588c116df6ca64a295017571151992c76d03132 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Thu, 22 Sep 2022 14:28:44 +0800 Subject: [PATCH 087/143] libbpf: Add pathname_concat() helper Move snprintf and len check to common helper pathname_concat() to make the code simpler. 
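To make the factored-out pattern concrete, here is a minimal stand-alone sketch of the snprintf-plus-length-check logic the helper captures; the main() driver and the example paths are hypothetical illustrations, not part of the patch:

  #include <errno.h>
  #include <stdio.h>

  static int pathname_concat(char *buf, size_t buf_sz, const char *path, const char *name)
  {
          int len;

          len = snprintf(buf, buf_sz, "%s/%s", path, name);
          if (len < 0)            /* formatting error */
                  return -EINVAL;
          if (len >= buf_sz)      /* result was truncated */
                  return -ENAMETOOLONG;
          return 0;
  }

  int main(void)
  {
          char buf[4096];

          if (!pathname_concat(buf, sizeof(buf), "/sys/fs/bpf", "my_map"))
                  printf("pin path: %s\n", buf);
          return 0;
  }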
Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1663828124-10437-1-git-send-email-wangyufen@huawei.com --- tools/lib/bpf/libbpf.c | 76 ++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 47 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 67bc18506150..e691f08a297f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2097,19 +2097,30 @@ static bool get_map_field_int(const char *map_name, const struct btf *btf, return true; } +static int pathname_concat(char *buf, size_t buf_sz, const char *path, const char *name) +{ + int len; + + len = snprintf(buf, buf_sz, "%s/%s", path, name); + if (len < 0) + return -EINVAL; + if (len >= buf_sz) + return -ENAMETOOLONG; + + return 0; +} + static int build_map_pin_path(struct bpf_map *map, const char *path) { char buf[PATH_MAX]; - int len; + int err; if (!path) path = "/sys/fs/bpf"; - len = snprintf(buf, PATH_MAX, "%s/%s", path, bpf_map__name(map)); - if (len < 0) - return -EINVAL; - else if (len >= PATH_MAX) - return -ENAMETOOLONG; + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return err; return bpf_map__set_pin_path(map, buf); } @@ -7968,17 +7979,9 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) continue; if (path) { - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, - bpf_map__name(map)); - if (len < 0) { - err = -EINVAL; + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) goto err_unpin_maps; - } else if (len >= PATH_MAX) { - err = -ENAMETOOLONG; - goto err_unpin_maps; - } sanitize_pin_path(buf); pin_path = buf; } else if (!map->pin_path) { @@ -8016,14 +8019,9 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) char buf[PATH_MAX]; if (path) { - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, - bpf_map__name(map)); - if (len < 0) - return libbpf_err(-EINVAL); - else if (len >= PATH_MAX) - return libbpf_err(-ENAMETOOLONG); + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return libbpf_err(err); sanitize_pin_path(buf); pin_path = buf; } else if (!map->pin_path) { @@ -8041,6 +8039,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) int bpf_object__pin_programs(struct bpf_object *obj, const char *path) { struct bpf_program *prog; + char buf[PATH_MAX]; int err; if (!obj) @@ -8052,17 +8051,9 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path) } bpf_object__for_each_program(prog, obj) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); - if (len < 0) { - err = -EINVAL; + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) goto err_unpin_programs; - } else if (len >= PATH_MAX) { - err = -ENAMETOOLONG; - goto err_unpin_programs; - } err = bpf_program__pin(prog, buf); if (err) @@ -8073,13 +8064,7 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path) err_unpin_programs: while ((prog = bpf_object__prev_program(obj, prog))) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); - if (len < 0) - continue; - else if (len >= PATH_MAX) + if (pathname_concat(buf, sizeof(buf), path, prog->name)) continue; bpf_program__unpin(prog, buf); @@ -8098,13 +8083,10 @@ int bpf_object__unpin_programs(struct bpf_object *obj, const char *path) bpf_object__for_each_program(prog, obj) { char buf[PATH_MAX]; - int len; - len = snprintf(buf, PATH_MAX, 
"%s/%s", path, prog->name); - if (len < 0) - return libbpf_err(-EINVAL); - else if (len >= PATH_MAX) - return libbpf_err(-ENAMETOOLONG); + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) + return libbpf_err(err); err = bpf_program__unpin(prog, buf); if (err) From dbdea9b36fb61da3b9a1be0dd63542e2bfd3e5d7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Sep 2022 16:05:59 -0700 Subject: [PATCH 088/143] libbpf: restore memory layout of bpf_object_open_opts When attach_prog_fd field was removed in libbpf 1.0 and replaced with `long: 0` placeholder, it actually shifted all the subsequent fields by 8 byte. This is due to `long: 0` promising to adjust next field's offset to long-aligned offset. But in this case we were already long-aligned as pin_root_path is a pointer. So `long: 0` had no effect, and thus didn't feel the gap created by removed attach_prog_fd. Non-zero bitfield should have been used instead. I validated using pahole. Originally kconfig field was at offset 40. With `long: 0` it's at offset 32, which is wrong. With this change it's back at offset 40. While technically libbpf 1.0 is allowed to break backwards compatibility and applications should have been recompiled against libbpf 1.0 headers, but given how trivial it is to preserve memory layout, let's fix this. Reported-by: Grant Seltzer Richman Fixes: 146bf811f5ac ("libbpf: remove most other deprecated high-level APIs") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220923230559.666608-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e2d8c17f2e85..eee883f007f9 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -118,7 +118,9 @@ struct bpf_object_open_opts { * auto-pinned to that path on load; defaults to "/sys/fs/bpf". */ const char *pin_root_path; - long :0; + + __u32 :32; /* stub out now removed attach_prog_fd */ + /* Additional kernel config content that augments and overrides * system Kconfig for CONFIG_xxx externs. */ From 067f4f291c2063d86abe0a526ef211e03a4f1258 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Sep 2022 10:59:09 -0700 Subject: [PATCH 089/143] selftests/bpf: add sign-file to .gitignore Add sign-file to .gitignore to avoid accidentally checking it in. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220923175913.3272430-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3b288562963e..07d2d0a8c5cb 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -40,6 +40,7 @@ test_cpp /runqslower /bench /veristat +/sign-file *.ko *.tmp xskxceiver From c2488d70ceee352611e55943c25abf30117e3b67 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Sep 2022 10:59:10 -0700 Subject: [PATCH 090/143] selftests/bpf: make veristat's verifier log parsing faster and more robust Make sure veristat doesn't spend ridiculous amount of time parsing verifier stats from verifier log, especially for very large logs or truncated logs (e.g., when verifier returns -ENOSPC due to too small buffer). For this, parse lines from the end of the log and make sure we parse only up to 100 last lines, where stats should be, if at all. 
Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220923175913.3272430-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 29 ++++++++++++++++++-------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 51030234b60a..77bdfd6fe302 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -419,19 +419,30 @@ static void free_verif_stats(struct verif_stats *stats, size_t stat_cnt) static char verif_log_buf[64 * 1024]; -static int parse_verif_log(const char *buf, size_t buf_sz, struct verif_stats *s) +#define MAX_PARSED_LOG_LINES 100 + +static int parse_verif_log(char * const buf, size_t buf_sz, struct verif_stats *s) { - const char *next; - int pos; + const char *cur; + int pos, lines; - for (pos = 0; buf[0]; buf = next) { - if (buf[0] == '\n') - buf++; - next = strchrnul(&buf[pos], '\n'); + buf[buf_sz - 1] = '\0'; - if (1 == sscanf(buf, "verification time %ld usec\n", &s->stats[DURATION])) + for (pos = strlen(buf) - 1, lines = 0; pos >= 0 && lines < MAX_PARSED_LOG_LINES; lines++) { + /* find previous endline or otherwise take the start of log buf */ + for (cur = &buf[pos]; cur > buf && cur[0] != '\n'; cur--, pos--) { + } + /* next time start from end of previous line (or pos goes to <0) */ + pos--; + /* if we found endline, point right after endline symbol; + * otherwise, stay at the beginning of log buf + */ + if (cur[0] == '\n') + cur++; + + if (1 == sscanf(cur, "verification time %ld usec\n", &s->stats[DURATION])) continue; - if (6 == sscanf(buf, "processed %ld insns (limit %*d) max_states_per_insn %ld total_states %ld peak_states %ld mark_read %ld", + if (6 == sscanf(cur, "processed %ld insns (limit %*d) max_states_per_insn %ld total_states %ld peak_states %ld mark_read %ld", &s->stats[TOTAL_INSNS], &s->stats[MAX_STATES_PER_INSN], &s->stats[TOTAL_STATES], From 518fee8bfaf2c628007909c0fc5336930b9b6ee4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Sep 2022 10:59:11 -0700 Subject: [PATCH 091/143] selftests/bpf: make veristat skip non-BPF and failing-to-open BPF objects Make veristat ignore non-BPF object files. This allows simpler mass-verification (e.g., `sudo ./veristat *.bpf.o` in selftests/bpf directory). Note that `sudo ./veristat *.o` would also work, but with selftests's multiple copies of BPF object files (.bpf.o and .bpf.linked{1,2,3}.o) it's 4x slower. Also, given some of BPF object files could be incomplete in the sense that they are meant to be statically linked into final BPF object file (like linked_maps, linked_funcs, linked_vars), note such instances in stderr, but proceed anyways. This seems like a better trade off between completely silently ignoring BPF object file and aborting mass-verification altogether. 
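The ELF sniffing boils down to three header fields; a simplified stand-alone version of the same idea, reading the ELF header directly, might look like this (a hypothetical sketch; the patch itself uses libelf, as its diff below shows):

  #include <elf.h>
  #include <fcntl.h>
  #include <stdbool.h>
  #include <string.h>
  #include <unistd.h>

  #ifndef EM_BPF
  #define EM_BPF 247
  #endif

  static bool looks_like_bpf_obj_file(const char *path)
  {
          Elf64_Ehdr ehdr;
          bool ret = false;
          int fd;

          fd = open(path, O_RDONLY | O_CLOEXEC);
          if (fd < 0)
                  return false;
          if (read(fd, &ehdr, sizeof(ehdr)) != sizeof(ehdr))
                  goto out;
          /* ELF magic, 64-bit class, relocatable, BPF (or EM_NONE from old LLVM) */
          ret = memcmp(ehdr.e_ident, ELFMAG, SELFMAG) == 0 &&
                ehdr.e_ident[EI_CLASS] == ELFCLASS64 &&
                ehdr.e_type == ET_REL &&
                (!ehdr.e_machine || ehdr.e_machine == EM_BPF);
  out:
          close(fd);
          return ret;
  }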
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220923175913.3272430-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 78 +++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 77bdfd6fe302..f09dd143a8df 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include enum stat_id { VERDICT, @@ -78,6 +80,11 @@ static struct env { struct filter *deny_filters; int allow_filter_cnt; int deny_filter_cnt; + + int files_processed; + int files_skipped; + int progs_processed; + int progs_skipped; } env; static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) @@ -226,8 +233,41 @@ static bool should_process_file(const char *filename) return false; } -static bool should_process_prog(const char *filename, const char *prog_name) +static bool is_bpf_obj_file(const char *path) { + Elf64_Ehdr *ehdr; + int fd, err = -EINVAL; + Elf *elf = NULL; + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return true; /* we'll fail later and propagate error */ + + /* ensure libelf is initialized */ + (void)elf_version(EV_CURRENT); + + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) + goto cleanup; + + if (elf_kind(elf) != ELF_K_ELF || gelf_getclass(elf) != ELFCLASS64) + goto cleanup; + + ehdr = elf64_getehdr(elf); + /* Old LLVM set e_machine to EM_NONE */ + if (!ehdr || ehdr->e_type != ET_REL || (ehdr->e_machine && ehdr->e_machine != EM_BPF)) + goto cleanup; + + err = 0; +cleanup: + if (elf) + elf_end(elf); + close(fd); + return err == 0; +} + +static bool should_process_prog(const char *path, const char *prog_name) { + const char *filename = basename(path); int i; if (env.deny_filter_cnt > 0) { @@ -303,7 +343,7 @@ static int append_filter_file(const char *path) f = fopen(path, "r"); if (!f) { err = -errno; - fprintf(stderr, "Failed to open '%s': %d\n", path, err); + fprintf(stderr, "Failed to open filters in '%s': %d\n", path, err); return err; } @@ -463,8 +503,10 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf int err = 0; void *tmp; - if (!should_process_prog(basename(filename), bpf_program__name(prog))) + if (!should_process_prog(filename, bpf_program__name(prog))) { + env.progs_skipped++; return 0; + } tmp = realloc(env.prog_stats, (env.prog_stat_cnt + 1) * sizeof(*env.prog_stats)); if (!tmp) @@ -487,6 +529,7 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf verif_log_buf[0] = '\0'; err = bpf_object__load(obj); + env.progs_processed++; stats->file_name = strdup(basename(filename)); stats->prog_name = strdup(bpf_program__name(prog)); @@ -513,18 +556,37 @@ static int process_obj(const char *filename) LIBBPF_OPTS(bpf_object_open_opts, opts); int err = 0, prog_cnt = 0; - if (!should_process_file(basename(filename))) + if (!should_process_file(basename(filename))) { + if (env.verbose) + printf("Skipping '%s' due to filters...\n", filename); + env.files_skipped++; return 0; + } + if (!is_bpf_obj_file(filename)) { + if (env.verbose) + printf("Skipping '%s' as it's not a BPF object file...\n", filename); + env.files_skipped++; + return 0; + } old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn); obj = bpf_object__open_file(filename, &opts); if (!obj) { - err = -errno; - fprintf(stderr, "Failed to open '%s': %d\n", filename, 
err);
+		/* if libbpf can't open BPF object file, it could be because
+		 * the BPF object file is incomplete and has to be statically
+		 * linked into a final BPF object file; instead of bailing
+		 * out, report it to stderr, mark it as skipped, and
+		 * proceed
+		 */
+		fprintf(stderr, "Failed to open '%s': %d\n", filename, -errno);
+		env.files_skipped++;
+		err = 0;
 		goto cleanup;
 	}

+	env.files_processed++;
+
 	bpf_object__for_each_program(prog, obj) {
 		prog_cnt++;
 	}
@@ -732,8 +794,8 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last
 	if (last && fmt == RESFMT_TABLE) {
 		output_header_underlines();
-		printf("Done. Processed %d object files, %d programs.\n",
-		       env.filename_cnt, env.prog_stat_cnt);
+		printf("Done. Processed %d files, %d programs. Skipped %d files, %d programs.\n",
+		       env.files_processed, env.files_skipped, env.progs_processed, env.progs_skipped);
 	}
 }

From c511d009ceb8cd980e4a823b7ca74abbdc7cdccc Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Fri, 23 Sep 2022 10:59:12 -0700
Subject: [PATCH 092/143] selftests/bpf: emit processing progress and add
 quiet mode to veristat

Emit "Processing ..." for each BPF object file to be processed, to show
progress. But also add -q (--quiet) flag to silence such messages.
Doing something more clever (like overwriting the same output line) is
too cumbersome and easily breakable if there is any other console
output (e.g., errors from libbpf).

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20220923175913.3272430-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 tools/testing/selftests/bpf/veristat.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index f09dd143a8df..85a77f1dd863 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -63,6 +63,7 @@ static struct env {
 	char **filenames;
 	int filename_cnt;
 	bool verbose;
+	bool quiet;
 	enum resfmt out_fmt;

 	bool comparison_mode;
@@ -107,6 +108,7 @@ const char argp_program_doc[] =
 static const struct argp_option opts[] = {
 	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
 	{ "verbose", 'v', NULL, 0, "Verbose mode" },
+	{ "quiet", 'q', NULL, 0, "Quiet mode" },
 	{ "emit", 'e', "SPEC", 0, "Specify stats to be emitted" },
 	{ "sort", 's', "SPEC", 0, "Specify sort order" },
 	{ "output-format", 'o', "FMT", 0, "Result output format (table, csv), default is table."
},
@@ -131,6 +133,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	case 'v':
 		env.verbose = true;
 		break;
+	case 'q':
+		env.quiet = true;
+		break;
 	case 'e':
 		err = parse_stats(arg, &env.output_spec);
 		if (err)
@@ -569,8 +574,10 @@ static int process_obj(const char *filename)
 		return 0;
 	}

-	old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn);
+	if (!env.quiet && env.out_fmt == RESFMT_TABLE)
+		printf("Processing '%s'...\n", basename(filename));
+	old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn);
 	obj = bpf_object__open_file(filename, &opts);
 	if (!obj) {
 		/* if libbpf can't open BPF object file, it could be because
@@ -1268,6 +1275,12 @@ int main(int argc, char **argv)
 	if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
 		return 1;

+	if (env.verbose && env.quiet) {
+		fprintf(stderr, "Verbose and quiet modes are incompatible, please specify just one or neither!\n");
+		argp_help(&argp, stderr, ARGP_HELP_USAGE, "veristat");
+		return 1;
+	}
+
 	if (env.output_spec.spec_cnt == 0)
 		env.output_spec = default_output_spec;
 	if (env.sort_spec.spec_cnt == 0)

From e310efc5ddde04c41aa0501b5a7235b134c5fc6c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Fri, 23 Sep 2022 10:59:13 -0700
Subject: [PATCH 093/143] selftests/bpf: allow to adjust BPF verifier log
 level in veristat

Add -l (--log-level) flag to override the default BPF verifier log
level. This only matters in verbose mode, which is the mode in which
veristat emits the verifier log for each processed BPF program.

This is important because for successfully verified BPF programs
log_level 1 is empty, as the BPF verifier truncates all the
successfully verified paths. So -l2 is the only way to actually get the
BPF verifier log in practice. It looks something like this:

  [vmuser@archvm bpf]$ sudo ./veristat xdp_tx.bpf.o -vl2
  Processing 'xdp_tx.bpf.o'...
  PROCESSING xdp_tx.bpf.o/xdp_tx, DURATION US: 19, VERDICT: success, VERIFIER LOG:
  func#0 @0
  0: R1=ctx(off=0,imm=0) R10=fp0
  ; return XDP_TX;
  0: (b4) w0 = 3                        ; R0_w=3
  1: (95) exit
  verification time 19 usec
  stack depth 0
  processed 2 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0

  File          Program  Verdict  Duration (us)  Total insns  Total states  Peak states
  ------------  -------  -------  -------------  -----------  ------------  -----------
  xdp_tx.bpf.o  xdp_tx   success             19            2             0            0
  ------------  -------  -------  -------------  -----------  ------------  -----------
  Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs.
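For context, the log_level handed to the kernel is a bitmask (1 for the basic log, 2 for the full verbose log, 4 for verifier stats only), and veristat always ORs in the stats bit. A hedged sketch of the equivalent libbpf calls (the buffer size and function name are arbitrary choices, not from the patch):

  #include <bpf/libbpf.h>

  static char log_buf[64 * 1024];

  static int request_verbose_log(struct bpf_program *prog)
  {
          int err;

          err = bpf_program__set_log_buf(prog, log_buf, sizeof(log_buf));
          if (err)
                  return err;
          /* roughly what veristat does for -v -l2: full log plus stats */
          return bpf_program__set_log_level(prog, 2 | 4);
  }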
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220923175913.3272430-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 85a77f1dd863..b0d83a28e348 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -64,6 +64,7 @@ static struct env { int filename_cnt; bool verbose; bool quiet; + int log_level; enum resfmt out_fmt; bool comparison_mode; @@ -108,6 +109,7 @@ const char argp_program_doc[] = static const struct argp_option opts[] = { { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, { "verbose", 'v', NULL, 0, "Verbose mode" }, + { "log-level", 'l', "LEVEL", 0, "Verifier log level (default 0 for normal mode, 1 for verbose mode)" }, { "quiet", 'q', NULL, 0, "Quiet mode" }, { "emit", 'e', "SPEC", 0, "Specify stats to be emitted" }, { "sort", 's', "SPEC", 0, "Specify sort order" }, @@ -156,6 +158,14 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return -EINVAL; } break; + case 'l': + errno = 0; + env.log_level = strtol(arg, NULL, 10); + if (errno) { + fprintf(stderr, "invalid log level: %s\n", arg); + argp_usage(state); + } + break; case 'C': env.comparison_mode = true; break; @@ -526,7 +536,7 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf if (!buf) return -ENOMEM; bpf_program__set_log_buf(prog, buf, buf_sz); - bpf_program__set_log_level(prog, 1 | 4); /* stats + log */ + bpf_program__set_log_level(prog, env.log_level | 4); /* stats + log */ } else { bpf_program__set_log_buf(prog, buf, buf_sz); bpf_program__set_log_level(prog, 4); /* only verifier stats */ @@ -1280,6 +1290,8 @@ int main(int argc, char **argv) argp_help(&argp, stderr, ARGP_HELP_USAGE, "veristat"); return 1; } + if (env.verbose && env.log_level == 0) + env.log_level = 1; if (env.output_spec.spec_cnt == 0) env.output_spec = default_output_spec; From 3f8ef65af927db247418d4e1db49164d7a158fc5 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Tue, 23 Aug 2022 21:37:54 +0800 Subject: [PATCH 094/143] net: If sock is dead don't access sock's sk_wq in sk_stream_wait_memory Fixes the below NULL pointer dereference: [...] [ 14.471200] Call Trace: [ 14.471562] [ 14.471882] lock_acquire+0x245/0x2e0 [ 14.472416] ? remove_wait_queue+0x12/0x50 [ 14.473014] ? _raw_spin_lock_irqsave+0x17/0x50 [ 14.473681] _raw_spin_lock_irqsave+0x3d/0x50 [ 14.474318] ? remove_wait_queue+0x12/0x50 [ 14.474907] remove_wait_queue+0x12/0x50 [ 14.475480] sk_stream_wait_memory+0x20d/0x340 [ 14.476127] ? do_wait_intr_irq+0x80/0x80 [ 14.476704] do_tcp_sendpages+0x287/0x600 [ 14.477283] tcp_bpf_push+0xab/0x260 [ 14.477817] tcp_bpf_sendmsg_redir+0x297/0x500 [ 14.478461] ? __local_bh_enable_ip+0x77/0xe0 [ 14.479096] tcp_bpf_send_verdict+0x105/0x470 [ 14.479729] tcp_bpf_sendmsg+0x318/0x4f0 [ 14.480311] sock_sendmsg+0x2d/0x40 [ 14.480822] ____sys_sendmsg+0x1b4/0x1c0 [ 14.481390] ? copy_msghdr_from_user+0x62/0x80 [ 14.482048] ___sys_sendmsg+0x78/0xb0 [ 14.482580] ? vmf_insert_pfn_prot+0x91/0x150 [ 14.483215] ? __do_fault+0x2a/0x1a0 [ 14.483738] ? do_fault+0x15e/0x5d0 [ 14.484246] ? __handle_mm_fault+0x56b/0x1040 [ 14.484874] ? lock_is_held_type+0xdf/0x130 [ 14.485474] ? find_held_lock+0x2d/0x90 [ 14.486046] ? __sys_sendmsg+0x41/0x70 [ 14.486587] __sys_sendmsg+0x41/0x70 [ 14.487105] ? 
intel_pmu_drain_pebs_core+0x350/0x350 [ 14.487822] do_syscall_64+0x34/0x80 [ 14.488345] entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] The test scenario has the following flow: thread1 thread2 ----------- --------------- tcp_bpf_sendmsg tcp_bpf_send_verdict tcp_bpf_sendmsg_redir sock_close tcp_bpf_push_locked __sock_release tcp_bpf_push //inet_release do_tcp_sendpages sock->ops->release sk_stream_wait_memory // tcp_close sk_wait_event sk->sk_prot->close release_sock(__sk); *** lock_sock(sk); __tcp_close sock_orphan(sk) sk->sk_wq = NULL release_sock **** lock_sock(__sk); remove_wait_queue(sk_sleep(sk), &wait); sk_sleep(sk) //NULL pointer dereference &rcu_dereference_raw(sk->sk_wq)->wait While waiting for memory in thread1, the socket is released with its wait queue because thread2 has closed it. This caused by tcp_bpf_send_verdict didn't increase the f_count of psock->sk_redir->sk_socket->file in thread1. We should check if SOCK_DEAD flag is set on wakeup in sk_stream_wait_memory before accessing the wait queue. Suggested-by: Jakub Sitnicki Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/20220823133755.314697-2-liujian56@huawei.com --- net/core/stream.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/stream.c b/net/core/stream.c index ccc083cdef23..1105057ce00a 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -159,7 +159,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) *timeo_p = current_timeo; } out: - remove_wait_queue(sk_sleep(sk), &wait); + if (!sock_flag(sk, SOCK_DEAD)) + remove_wait_queue(sk_sleep(sk), &wait); return err; do_error: From 043a7356dbd0f44b2a2161649d89f4a43f3b0180 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Tue, 23 Aug 2022 21:37:55 +0800 Subject: [PATCH 095/143] selftests/bpf: Add wait send memory test for sockmap redirect Add one test for wait redirect sock's send memory test for sockmap. 
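The test provokes the wait-for-memory path by shrinking the socket buffers and bounding the wait. A minimal stand-alone sketch of that setup (fd is assumed to be a connected TCP socket; SO_SNDBUFFORCE requires CAP_NET_ADMIN):

  #include <sys/socket.h>
  #include <sys/time.h>

  static int force_send_pressure(int fd)
  {
          struct timeval timeout = { .tv_sec = 3, .tv_usec = 0 };
          int buf_len = 1024;
          int err;

          /* a blocked sendmsg() now fails with EAGAIN after ~3s instead of hanging */
          err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(timeout));
          /* force a tiny send buffer so the sender fills it almost immediately */
          err |= setsockopt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &buf_len, sizeof(buf_len));
          return err;
  }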
Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220823133755.314697-3-liujian56@huawei.com --- tools/testing/selftests/bpf/test_sockmap.c | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index dcb038e342d8..e768181a1bd7 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -138,6 +138,7 @@ struct sockmap_options { bool data_test; bool drop_expected; bool check_recved_len; + bool tx_wait_mem; int iov_count; int iov_length; int rate; @@ -578,6 +579,10 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, sent = sendmsg(fd, &msg, flags); if (!drop && sent < 0) { + if (opt->tx_wait_mem && errno == EACCES) { + errno = 0; + goto out_errno; + } perror("sendmsg loop error"); goto out_errno; } else if (drop && sent >= 0) { @@ -644,6 +649,15 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, goto out_errno; } + if (opt->tx_wait_mem) { + FD_ZERO(&w); + FD_SET(fd, &w); + slct = select(max_fd + 1, NULL, NULL, &w, &timeout); + errno = 0; + close(fd); + goto out_errno; + } + errno = 0; if (peek_flag) { flags |= MSG_PEEK; @@ -752,6 +766,22 @@ static int sendmsg_test(struct sockmap_options *opt) return err; } + if (opt->tx_wait_mem) { + struct timeval timeout; + int rxtx_buf_len = 1024; + + timeout.tv_sec = 3; + timeout.tv_usec = 0; + + err = setsockopt(c2, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(struct timeval)); + err |= setsockopt(c2, SOL_SOCKET, SO_SNDBUFFORCE, &rxtx_buf_len, sizeof(int)); + err |= setsockopt(p2, SOL_SOCKET, SO_RCVBUFFORCE, &rxtx_buf_len, sizeof(int)); + if (err) { + perror("setsockopt failed()"); + return errno; + } + } + rxpid = fork(); if (rxpid == 0) { if (txmsg_pop || txmsg_start_pop) @@ -788,6 +818,9 @@ static int sendmsg_test(struct sockmap_options *opt) return errno; } + if (opt->tx_wait_mem) + close(c2); + txpid = fork(); if (txpid == 0) { if (opt->sendpage) @@ -1452,6 +1485,14 @@ static void test_txmsg_redir(int cgrp, struct sockmap_options *opt) test_send(opt, cgrp); } +static void test_txmsg_redir_wait_sndmem(int cgrp, struct sockmap_options *opt) +{ + txmsg_redir = 1; + opt->tx_wait_mem = true; + test_send_large(opt, cgrp); + opt->tx_wait_mem = false; +} + static void test_txmsg_drop(int cgrp, struct sockmap_options *opt) { txmsg_drop = 1; @@ -1800,6 +1841,7 @@ static int populate_progs(char *bpf_file) struct _test test[] = { {"txmsg test passthrough", test_txmsg_pass}, {"txmsg test redirect", test_txmsg_redir}, + {"txmsg test redirect wait send mem", test_txmsg_redir_wait_sndmem}, {"txmsg test drop", test_txmsg_drop}, {"txmsg test ingress redirect", test_txmsg_ingress_redir}, {"txmsg test skb", test_txmsg_skb}, From bec217197b412d74168c6a42fc0f76d0cc9cad00 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Wed, 7 Sep 2022 15:13:11 +0800 Subject: [PATCH 096/143] skmsg: Schedule psock work if the cached skb exists on the psock In sk_psock_backlog function, for ingress direction skb, if no new data packet arrives after the skb is cached, the cached skb does not have a chance to be added to the receive queue of psock. As a result, the cached skb cannot be received by the upper-layer application. Fix this by reschedule the psock work to dispose the cached skb in sk_msg_recvmsg function. 
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220907071311.60534-1-liujian56@huawei.com --- net/core/skmsg.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 188f8558d27d..ca70525621c7 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -434,8 +434,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (copied + copy > len) copy = len - copied; copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; + if (!copy) { + copied = copied ? copied : -EFAULT; + goto out; + } copied += copy; if (likely(!peek)) { @@ -455,7 +457,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, * didn't copy the entire length lets just break. */ if (copy != sge->length) - return copied; + goto out; sk_msg_iter_var_next(i); } @@ -477,7 +479,9 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } msg_rx = sk_psock_peek_msg(psock); } - +out: + if (psock->work_state.skb && copied > 0) + schedule_work(&psock->work); return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); From bf7a87f1075f67c286f794519f0fedfa8b0b18cc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:35 +0200 Subject: [PATCH 097/143] kprobes: Add new KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag Adding KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag to indicate that attach address is on function entry. This is used in following changes in get_func_ip helper to return correct function address. Acked-by: Masami Hiramatsu (Google) Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220926153340.1621984-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/kprobes.h | 1 + kernel/kprobes.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 55041d2f884d..a0b92be98984 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -103,6 +103,7 @@ struct kprobe { * this flag is only for optimized_kprobe. */ #define KPROBE_FLAG_FTRACE 8 /* probe is using ftrace */ +#define KPROBE_FLAG_ON_FUNC_ENTRY 16 /* probe is on the function entry */ /* Has this kprobe gone ? */ static inline bool kprobe_gone(struct kprobe *p) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 08350e35aba2..51adc3c94503 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1606,9 +1606,10 @@ int register_kprobe(struct kprobe *p) struct kprobe *old_p; struct module *probed_mod; kprobe_opcode_t *addr; + bool on_func_entry; /* Adjust probe address from symbol */ - addr = kprobe_addr(p); + addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry); if (IS_ERR(addr)) return PTR_ERR(addr); p->addr = addr; @@ -1628,6 +1629,9 @@ int register_kprobe(struct kprobe *p) mutex_lock(&kprobe_mutex); + if (on_func_entry) + p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY; + old_p = get_kprobe(p->addr); if (old_p) { /* Since this may unoptimize 'old_p', locking 'text_mutex'. */ From 9d68c19c57d690547cde977bb3d9ccd3ceb6afe9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:36 +0200 Subject: [PATCH 098/143] ftrace: Keep the resolved addr in kallsyms_callback Keeping the resolved 'addr' in kallsyms_callback, instead of taking ftrace_location value, because we depend on symbol address in the cookie related code. 
With the CONFIG_X86_KERNEL_IBT option, the ftrace_location value differs
from the symbol address, which screws up the symbol-address cookie
matching.

There are 2 users of this function:
- bpf_kprobe_multi_link_attach, which this fix is for
- get_ftrace_locations, which is used by register_fprobe_syms

  this function needs to get symbols resolved to addresses, but does
  not need 'ftrace location addresses' at this point; there's another
  ftrace location translation later in the path, done by the
  ftrace_set_filter_ips call:

    register_fprobe_syms
      addrs = get_ftrace_locations
      register_fprobe_ips(addrs)
        ...
        ftrace_set_filter_ips
          ...
          __ftrace_match_addr
            ip = ftrace_location(ip);
            ...

Reviewed-by: Masami Hiramatsu (Google)
Signed-off-by: Jiri Olsa
Link: https://lore.kernel.org/r/20220926153340.1621984-3-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/trace/ftrace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 439e2ab6905e..447d2e2a8549 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -8265,8 +8265,7 @@ static int kallsyms_callback(void *data, const char *name,
 	if (args->addrs[idx])
 		return 0;

-	addr = ftrace_location(addr);
-	if (!addr)
+	if (!ftrace_location(addr))
 		return 0;

 	args->addrs[idx] = addr;

From 4d854f4f31ec4b317dfe316111ddac0fab81f735 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Mon, 26 Sep 2022 17:33:37 +0200
Subject: [PATCH 099/143] bpf: Use given function address for trampoline ip
 arg

Use the function address given at generation time as the trampoline ip
argument. This way we get directly the function address that we need,
so we don't need to:
  - read the ip from the stack
  - subtract X86_PATCH_SIZE
  - subtract ENDBR_INSN_SIZE if CONFIG_X86_KERNEL_IBT is enabled,
    which is not even implemented yet ;-)

Signed-off-by: Jiri Olsa
Link: https://lore.kernel.org/r/20220926153340.1621984-4-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov
---
 arch/x86/net/bpf_jit_comp.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ae89f4143eb4..d4a6183197e9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -662,7 +662,7 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
 		 */
 		emit_mov_imm32(&prog, false, dst_reg, imm32_lo);
 	} else {
-		/* movabsq %rax, imm64 */
+		/* movabsq rax, imm64 */
 		EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
 		EMIT(imm32_lo, 4);
 		EMIT(imm32_hi, 4);
@@ -2039,13 +2039,14 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_links *tlinks,
-				void *orig_call)
+				void *func_addr)
 {
 	int ret, i, nr_args = m->nr_args, extra_nregs = 0;
 	int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off;
 	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
 	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
 	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
+	void *orig_call = func_addr;
 	u8 **branches = NULL;
 	u8 *prog;
 	bool save_ret;
@@ -2126,12 +2127,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	if (flags & BPF_TRAMP_F_IP_ARG) {
 		/* Store IP address of the traced function:
-		 * mov rax, QWORD PTR [rbp + 8]
-		 * sub rax, X86_PATCH_SIZE
+		 * movabsq rax, func_addr
 		 * mov QWORD PTR [rbp - ip_off], rax
 		 */
-		emit_ldx(&prog, BPF_DW, BPF_REG_0,
BPF_REG_FP, 8); - EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE); + emit_mov_imm64(&prog, BPF_REG_0, (long) func_addr >> 32, (u32) (long) func_addr); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off); } From c09eb2e578eb1668bbc84dc07e8d8bd6f04b9a02 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:38 +0200 Subject: [PATCH 100/143] bpf: Adjust kprobe_multi entry_ip for CONFIG_X86_KERNEL_IBT Martynas reported bpf_get_func_ip returning +4 address when CONFIG_X86_KERNEL_IBT option is enabled. When CONFIG_X86_KERNEL_IBT is enabled we'll have endbr instruction at the function entry, which screws return value of bpf_get_func_ip() helper that should return the function address. There's short term workaround for kprobe_multi bpf program made by Alexei [1], but we need this fixup also for bpf_get_attach_cookie, that returns cookie based on the entry_ip value. Moving the fixup in the fprobe handler, so both bpf_get_func_ip and bpf_get_attach_cookie get expected function address when CONFIG_X86_KERNEL_IBT option is enabled. Also renaming kprobe_multi_link_handler entry_ip argument to fentry_ip so it's clearer this is an ftrace __fentry__ ip. [1] commit 7f0059b58f02 ("selftests/bpf: Fix kprobe_multi test.") Cc: Peter Zijlstra Reported-by: Martynas Pumputis Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220926153340.1621984-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 20 +++++++++++++++++-- .../selftests/bpf/progs/kprobe_multi.c | 4 +--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b05f0310dbd3..ebd1b348beb3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1028,6 +1028,22 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; +#ifdef CONFIG_X86_KERNEL_IBT +static unsigned long get_entry_ip(unsigned long fentry_ip) +{ + u32 instr; + + /* Being extra safe in here in case entry ip is on the page-edge. */ + if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1)) + return fentry_ip; + if (is_endbr(instr)) + fentry_ip -= ENDBR_INSN_SIZE; + return fentry_ip; +} +#else +#define get_entry_ip(fentry_ip) fentry_ip +#endif + BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct kprobe *kp = kprobe_running(); @@ -2600,13 +2616,13 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, } static void -kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, +kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, struct pt_regs *regs) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, entry_ip, regs); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi.c b/tools/testing/selftests/bpf/progs/kprobe_multi.c index 08f95a8155d1..98c3399e15c0 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi.c @@ -36,15 +36,13 @@ __u64 kretprobe_test6_result = 0; __u64 kretprobe_test7_result = 0; __u64 kretprobe_test8_result = 0; -extern bool CONFIG_X86_KERNEL_IBT __kconfig __weak; - static void kprobe_multi_check(void *ctx, bool is_return) { if (bpf_get_current_pid_tgid() >> 32 != pid) return; __u64 cookie = test_cookie ? 
bpf_get_attach_cookie(ctx) : 0; - __u64 addr = bpf_get_func_ip(ctx) - (CONFIG_X86_KERNEL_IBT ? 4 : 0); + __u64 addr = bpf_get_func_ip(ctx); #define SET(__var, __addr, __cookie) ({ \ if (((const void *) addr == __addr) && \ From 0e253f7e558a3e250902ba2034091e0185448836 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:39 +0200 Subject: [PATCH 101/143] bpf: Return value in kprobe get_func_ip only for entry address Changing return value of kprobe's version of bpf_get_func_ip to return zero if the attach address is not on the function's entry point. For kprobes attached in the middle of the function we can't easily get to the function address especially now with the CONFIG_X86_KERNEL_IBT support. If user cares about current IP for kprobes attached within the function body, they can get it with PT_REGS_IP(ctx). Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martynas Pumputis Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220926153340.1621984-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 5 ++++- tools/include/uapi/linux/bpf.h | 1 + tools/testing/selftests/bpf/progs/get_func_ip_test.c | 4 ++-- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ead35f39f185..d6bd10759eaf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4951,6 +4951,7 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). * * u64 bpf_get_attach_cookie(void *ctx) * Description diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ebd1b348beb3..688552df95ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1048,7 +1048,10 @@ BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct kprobe *kp = kprobe_running(); - return kp ? (uintptr_t)kp->addr : 0; + if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY)) + return 0; + + return get_entry_ip((uintptr_t)kp->addr); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ead35f39f185..d6bd10759eaf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4951,6 +4951,7 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). 
* * u64 bpf_get_attach_cookie(void *ctx) * Description diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index a587aeca5ae0..6db70757bc8b 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -69,7 +69,7 @@ int test6(struct pt_regs *ctx) { __u64 addr = bpf_get_func_ip(ctx); - test6_result = (const void *) addr == &bpf_fentry_test6 + 5; + test6_result = (const void *) addr == 0; return 0; } @@ -79,6 +79,6 @@ int test7(struct pt_regs *ctx) { __u64 addr = bpf_get_func_ip(ctx); - test7_result = (const void *) addr == &bpf_fentry_test7 + 5; + test7_result = (const void *) addr == 0; return 0; } From 738c345b74b8d11edd01b6cee5628c6b8368d8ea Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:40 +0200 Subject: [PATCH 102/143] selftests/bpf: Fix get_func_ip offset test for CONFIG_X86_KERNEL_IBT With CONFIG_X86_KERNEL_IBT enabled the test for kprobe with offset won't work because of the extra endbr instruction. As suggested by Andrii adding CONFIG_X86_KERNEL_IBT detection and using appropriate offset value based on that. Also removing test7 program, because it does the same as test6. Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220926153340.1621984-7-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/get_func_ip_test.c | 63 +++++++++++++++---- .../selftests/bpf/progs/get_func_ip_test.c | 23 +++---- 2 files changed, 62 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c index 938dbd4d7c2f..fede8ef58b5b 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c @@ -2,7 +2,7 @@ #include #include "get_func_ip_test.skel.h" -void test_get_func_ip_test(void) +static void test_function_entry(void) { struct get_func_ip_test *skel = NULL; int err, prog_fd; @@ -12,14 +12,6 @@ void test_get_func_ip_test(void) if (!ASSERT_OK_PTR(skel, "get_func_ip_test__open")) return; - /* test6 is x86_64 specifc because of the instruction - * offset, disabling it for all other archs - */ -#ifndef __x86_64__ - bpf_program__set_autoload(skel->progs.test6, false); - bpf_program__set_autoload(skel->progs.test7, false); -#endif - err = get_func_ip_test__load(skel); if (!ASSERT_OK(err, "get_func_ip_test__load")) goto cleanup; @@ -43,11 +35,56 @@ void test_get_func_ip_test(void) ASSERT_EQ(skel->bss->test3_result, 1, "test3_result"); ASSERT_EQ(skel->bss->test4_result, 1, "test4_result"); ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); -#ifdef __x86_64__ - ASSERT_EQ(skel->bss->test6_result, 1, "test6_result"); - ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); -#endif cleanup: get_func_ip_test__destroy(skel); } + +/* test6 is x86_64 specific because of the instruction + * offset, disabling it for all other archs + */ +#ifdef __x86_64__ +static void test_function_body(void) +{ + struct get_func_ip_test *skel = NULL; + LIBBPF_OPTS(bpf_test_run_opts, topts); + LIBBPF_OPTS(bpf_kprobe_opts, kopts); + struct bpf_link *link6 = NULL; + int err, prog_fd; + + skel = get_func_ip_test__open(); + if (!ASSERT_OK_PTR(skel, "get_func_ip_test__open")) + return; + + bpf_program__set_autoload(skel->progs.test6, true); + + err = get_func_ip_test__load(skel); + if (!ASSERT_OK(err, "get_func_ip_test__load")) + 
goto cleanup;
+
+	kopts.offset = skel->kconfig->CONFIG_X86_KERNEL_IBT ? 9 : 5;
+
+	link6 = bpf_program__attach_kprobe_opts(skel->progs.test6, "bpf_fentry_test6", &kopts);
+	if (!ASSERT_OK_PTR(link6, "link6"))
+		goto cleanup;
+
+	prog_fd = bpf_program__fd(skel->progs.test1);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+	ASSERT_EQ(topts.retval, 0, "test_run");
+
+	ASSERT_EQ(skel->bss->test6_result, 1, "test6_result");
+
+cleanup:
+	bpf_link__destroy(link6);
+	get_func_ip_test__destroy(skel);
+}
+#else
+#define test_function_body()
+#endif
+
+void test_get_func_ip_test(void)
+{
+	test_function_entry();
+	test_function_body();
+}
diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
index 6db70757bc8b..8559e698b40d 100644
--- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c
+++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
@@ -2,6 +2,7 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include <stdbool.h>

 char _license[] SEC("license") = "GPL";

@@ -13,6 +14,16 @@ extern const void bpf_modify_return_test __ksym;
 extern const void bpf_fentry_test6 __ksym;
 extern const void bpf_fentry_test7 __ksym;

+extern bool CONFIG_X86_KERNEL_IBT __kconfig __weak;
+
+/* This function is here to have CONFIG_X86_KERNEL_IBT
+ * used and added to object BTF.
+ */
+int unused(void)
+{
+	return CONFIG_X86_KERNEL_IBT ? 0 : 1;
+}
+
 __u64 test1_result = 0;
 SEC("fentry/bpf_fentry_test1")
 int BPF_PROG(test1, int a)
@@ -64,7 +75,7 @@ int BPF_PROG(test5, int a, int *b, int ret)
 }

 __u64 test6_result = 0;
-SEC("kprobe/bpf_fentry_test6+0x5")
+SEC("?kprobe")
 int test6(struct pt_regs *ctx)
 {
 	__u64 addr = bpf_get_func_ip(ctx);
@@ -72,13 +83,3 @@ int test6(struct pt_regs *ctx)
 	test6_result = (const void *) addr == 0;
 	return 0;
 }
-
-__u64 test7_result = 0;
-SEC("kprobe/bpf_fentry_test7+5")
-int test7(struct pt_regs *ctx)
-{
-	__u64 addr = bpf_get_func_ip(ctx);
-
-	test7_result = (const void *) addr == 0;
-	return 0;
-}

From 19c02415da2345d0dda2b5c4495bc17cc14b18b5 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Mon, 26 Sep 2022 11:47:38 -0700
Subject: [PATCH 103/143] bpf: use bpf_prog_pack for bpf_dispatcher

Allocate bpf_dispatcher with bpf_prog_pack_alloc so that bpf_dispatcher
can share pages with bpf programs. arch_prepare_bpf_dispatcher() is
updated to provide an RW buffer as a working area for arch code to
write to.

This also fixes a CPA W^X warning like:

CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ...
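The W^X discipline behind this and the next patch can be illustrated with a plain userspace analogy: stage code in a writable mapping, then flip it to read-execute before use, so no page is ever writable and executable at once (a hypothetical sketch of the pattern, not the kernel mechanism itself):

  #include <string.h>
  #include <sys/mman.h>

  /* Copy `len` bytes of machine code into a fresh mapping, and only make
   * it executable after all writing is done, so the pages are never W+X.
   */
  static void *install_code(const void *code, size_t len)
  {
          void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          if (buf == MAP_FAILED)
                  return NULL;
          memcpy(buf, code, len);                 /* write while RW */
          if (mprotect(buf, len, PROT_READ | PROT_EXEC)) {  /* then seal to RX */
                  munmap(buf, len);
                  return NULL;
          }
          return buf;
  }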
Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220926184739.3512547-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 16 ++++++++-------- include/linux/bpf.h | 3 ++- include/linux/filter.h | 5 +++++ kernel/bpf/core.c | 9 +++++++-- kernel/bpf/dispatcher.c | 27 +++++++++++++++++++++------ 5 files changed, 43 insertions(+), 17 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d4a6183197e9..35796db58116 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2242,7 +2242,7 @@ cleanup: return ret; } -static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) +static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf) { u8 *jg_reloc, *prog = *pprog; int pivot, err, jg_bytes = 1; @@ -2258,12 +2258,12 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a]); err = emit_cond_near_jump(&prog, /* je func */ - (void *)progs[a], prog, + (void *)progs[a], image + (prog - buf), X86_JE); if (err) return err; - emit_indirect_jump(&prog, 2 /* rdx */, prog); + emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf)); *pprog = prog; return 0; @@ -2288,7 +2288,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) jg_reloc = prog; err = emit_bpf_dispatcher(&prog, a, a + pivot, /* emit lower_part */ - progs); + progs, image, buf); if (err) return err; @@ -2302,7 +2302,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes); err = emit_bpf_dispatcher(&prog, a + pivot + 1, /* emit upper_part */ - b, progs); + b, progs, image, buf); if (err) return err; @@ -2322,12 +2322,12 @@ static int cmp_ips(const void *a, const void *b) return 0; } -int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs) { - u8 *prog = image; + u8 *prog = buf; sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL); - return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs); + return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf); } struct x64_jit_data { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index edd43edb27d6..9ae155c75014 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -946,6 +946,7 @@ struct bpf_dispatcher { struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; int num_progs; void *image; + void *rw_image; u32 image_off; struct bpf_ksym ksym; }; @@ -964,7 +965,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); -int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); +int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 98e28126c24b..efc42a6e3aed 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1023,6 +1023,8 @@ extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned 
int alignment, @@ -1035,6 +1037,9 @@ void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_prog_pack_free(struct bpf_binary_header *hdr); + static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d1be78c28619..711fd293b6de 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -825,6 +825,11 @@ struct bpf_prog_pack { unsigned long bitmap[]; }; +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) +{ + memset(area, 0, size); +} + #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); @@ -864,7 +869,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins return pack; } -static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; @@ -905,7 +910,7 @@ out: return ptr; } -static void bpf_prog_pack_free(struct bpf_binary_header *hdr) +void bpf_prog_pack_free(struct bpf_binary_header *hdr) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 2444bd15cc2d..fa64b80b8bca 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -85,12 +85,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, return false; } -int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs) { return -ENOTSUPP; } -static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf) { s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; int i; @@ -99,12 +99,12 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) if (d->progs[i].prog) *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; } - return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); + return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs); } static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) { - void *old, *new; + void *old, *new, *tmp; u32 noff; int err; @@ -117,8 +117,14 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) } new = d->num_progs ? d->image + noff : NULL; + tmp = d->num_progs ? d->rw_image + noff : NULL; if (new) { - if (bpf_dispatcher_prepare(d, new)) + /* Prepare the dispatcher in d->rw_image. Then use + * bpf_arch_text_copy to update d->image, which is RO+X. 
+	 */
+	if (bpf_dispatcher_prepare(d, new, tmp))
+		return;
+	if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
+		return;
 	}

@@ -140,9 +146,18 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 	mutex_lock(&d->mutex);
 	if (!d->image) {
-		d->image = bpf_jit_alloc_exec_page();
+		d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
 		if (!d->image)
 			goto out;
+		d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
+		if (!d->rw_image) {
+			u32 size = PAGE_SIZE;
+
+			bpf_arch_text_copy(d->image, &size, sizeof(size));
+			bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+			d->image = NULL;
+			goto out;
+		}
 		bpf_image_ksym_add(d->image, &d->ksym);
 	}

From 5b0d1c7bd5722467960829af51d523f5a6ffd848 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Mon, 26 Sep 2022 11:47:39 -0700
Subject: [PATCH 104/143] bpf: Enforce W^X for bpf trampoline

Mark the trampoline as RO+X after arch_prepare_bpf_trampoline, so that
the trampoline follows the W^X rule strictly. This will turn off
warnings like

CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ...

Also remove bpf_jit_alloc_exec_page(), since it is not used any more.

Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20220926184739.3512547-3-song@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h     |  1 -
 kernel/bpf/trampoline.c | 22 +++++-----------------
 2 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9ae155c75014..5161fac0513f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1008,7 +1008,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 				struct bpf_prog *to);
 /* Called only from JIT-enabled code, so there's no need for stubs. */
-void *bpf_jit_alloc_exec_page(void);
 void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 41b67eb83ab3..6f7b939321d6 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -116,22 +116,6 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 	       (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
 }

-void *bpf_jit_alloc_exec_page(void)
-{
-	void *image;
-
-	image = bpf_jit_alloc_exec(PAGE_SIZE);
-	if (!image)
-		return NULL;
-
-	set_vm_flush_reset_perms(image);
-	/* Keep image as writeable. The alternative is to keep flipping ro/rw
-	 * every time new program is attached or detached.
- */ - set_memory_x((long)image, 1); - return image; -} - void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; @@ -404,9 +388,10 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) goto out_free_im; err = -ENOMEM; - im->image = image = bpf_jit_alloc_exec_page(); + im->image = image = bpf_jit_alloc_exec(PAGE_SIZE); if (!image) goto out_uncharge; + set_vm_flush_reset_perms(image); err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) @@ -483,6 +468,9 @@ again: if (err < 0) goto out; + set_memory_ro((long)im->image, 1); + set_memory_x((long)im->image, 1); + WARN_ON(tr->cur_image && tr->selector == 0); WARN_ON(!tr->cur_image && tr->selector); if (tr->cur_image) From 6a4ab8869d0bfcf83d7c5184561df8235553cf28 Mon Sep 17 00:00:00 2001 From: Jon Doron Date: Sun, 25 Sep 2022 10:04:31 +0300 Subject: [PATCH 105/143] libbpf: Fix the case of running as non-root with capabilities When running rootless with special capabilities like FOWNER / DAC_OVERRIDE / DAC_READ_SEARCH, the "access" API will not properly check whether there is really access to a file or not. From the access man page: " The check is done using the calling process's real UID and GID, rather than the effective IDs as is done when actually attempting an operation (e.g., open(2)) on the file. Similarly, for the root user, the check uses the set of permitted capabilities rather than the set of effective capabilities; ***and for non-root users, the check uses an empty set of capabilities.*** " What that means is that for a non-root user the access API will not properly validate whether the process really has permission to a file or not. To resolve this, this patch replaces all the access API calls with faccessat() using the AT_EACCESS flag.
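For illustration only (not part of the patch), a minimal standalone sketch of the difference; the path is just an example:

  #define _GNU_SOURCE
  #include <fcntl.h>              /* AT_FDCWD, AT_EACCESS */
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          const char *path = "/sys/kernel/btf/vmlinux";   /* example path */

          /* Checks against the real UID/GID (and, for non-root, an empty
           * capability set), so this can fail even when open() would succeed.
           */
          if (access(path, R_OK))
                  perror("access");

          /* Checks against the effective IDs, matching what open() enforces. */
          if (faccessat(AT_FDCWD, path, R_OK, AT_EACCESS))
                  perror("faccessat");

          return 0;
  }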
Signed-off-by: Jon Doron Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220925070431.1313680-1-arilou@gmail.com --- tools/lib/bpf/btf.c | 2 +- tools/lib/bpf/libbpf.c | 6 +++--- tools/lib/bpf/usdt.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index b4d9a96c3c1b..d88647da2c7f 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4664,7 +4664,7 @@ struct btf *btf__load_vmlinux_btf(void) for (i = 0; i < ARRAY_SIZE(locations); i++) { snprintf(path, PATH_MAX, locations[i], buf.release); - if (access(path, R_OK)) + if (faccessat(AT_FDCWD, path, R_OK, AT_EACCESS)) continue; btf = btf__parse(path, NULL); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e691f08a297f..184ce1684dcd 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -884,7 +884,7 @@ __u32 get_kernel_version(void) __u32 major, minor, patch; struct utsname info; - if (access(ubuntu_kver_file, R_OK) == 0) { + if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) == 0) { FILE *f; f = fopen(ubuntu_kver_file, "r"); @@ -9904,7 +9904,7 @@ static bool use_debugfs(void) static int has_debugfs = -1; if (has_debugfs < 0) - has_debugfs = access(DEBUGFS, F_OK) == 0; + has_debugfs = faccessat(AT_FDCWD, DEBUGFS, F_OK, AT_EACCESS) == 0; return has_debugfs == 1; } @@ -10721,7 +10721,7 @@ static int resolve_full_path(const char *file, char *result, size_t result_sz) continue; snprintf(result, result_sz, "%.*s/%s", seg_len, s, file); /* ensure it has required permissions */ - if (access(result, perm) < 0) + if (faccessat(AT_FDCWD, result, perm, AT_EACCESS) < 0) continue; pr_debug("resolved '%s' to '%s'\n", file, result); return 0; diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index d18e37982344..e83b497c2245 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -282,7 +282,7 @@ struct usdt_manager *usdt_manager_new(struct bpf_object *obj) * If this is not supported, USDTs with semaphores will not be supported. * Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") */ - man->has_sema_refcnt = access(ref_ctr_sysfs_path, F_OK) == 0; + man->has_sema_refcnt = faccessat(AT_FDCWD, ref_ctr_sysfs_path, F_OK, AT_EACCESS) == 0; return man; } From 2702c789996d9001cb60ef1dac055aca84d3c51a Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Mon, 26 Sep 2022 12:23:20 +0300 Subject: [PATCH 106/143] selftests/bpf: Fix passing arguments via function in test_kmod.sh Since the tests are run inside a function, $@ there actually contains the function's arguments, not the script's. Pass "$@" on to the function as well.
Fixes: 272d1f4cfa3c ("selftests: bpf: test_kmod.sh: Pass parameters to the module") Signed-off-by: Yauheni Kaliuta Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220926092320.564631-1-ykaliuta@redhat.com --- tools/testing/selftests/bpf/test_kmod.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh index d4a4279c0181..50dca53ac536 100755 --- a/tools/testing/selftests/bpf/test_kmod.sh +++ b/tools/testing/selftests/bpf/test_kmod.sh @@ -29,6 +29,7 @@ test_run() sysctl -w net.core.bpf_jit_harden=$2 2>&1 > /dev/null echo "[ JIT enabled:$1 hardened:$2 ]" + shift 2 dmesg -C if [ -f ${OUTPUT}/lib/test_bpf.ko ]; then insmod ${OUTPUT}/lib/test_bpf.ko "$@" 2> /dev/null @@ -64,9 +65,9 @@ test_restore() rc=0 test_save -test_run 0 0 -test_run 1 0 -test_run 1 1 -test_run 1 2 +test_run 0 0 "$@" +test_run 1 0 "$@" +test_run 1 1 "$@" +test_run 1 2 "$@" test_restore exit $rc From 87dbdc230d162bf9ee1ac77c8ade178b6b1e199e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 26 Sep 2022 21:29:39 -0700 Subject: [PATCH 107/143] libbpf: Don't require full struct enum64 in UAPI headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the requirement for system-wide kernel UAPI headers to provide full struct btf_enum64 definition. This is an unexpected requirement that slipped into libbpf 1.0 and put unnecessary pressure ([0]) on users to have a bleeding-edge kernel UAPI header from unreleased Linux 6.0. To achieve this, we forward declare struct btf_enum64. But that's not enough as there is the btf_enum64_value() helper that expects to know the layout of struct btf_enum64. So we get a bit creative by reinterpreting the memory layout as an array of __u32 and accessing the lo32/hi32 fields as array elements. An alternative way would be to have a local pointer variable for an anonymous struct with exactly the same layout as struct btf_enum64, but that gets us into C++ compiler errors complaining about invalid type casts. So play it safe, if ugly. [0] Closes: https://github.com/libbpf/libbpf/issues/562 Fixes: d90ec262b35b ("libbpf: Add enum64 support for btf_dump") Reported-by: Toke Høiland-Jørgensen Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20220927042940.147185-1-andrii@kernel.org --- tools/lib/bpf/btf.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index ae543144ee30..8e6880d91c84 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -486,6 +486,8 @@ static inline struct btf_enum *btf_enum(const struct btf_type *t) return (struct btf_enum *)(t + 1); } +struct btf_enum64; + static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) { return (struct btf_enum64 *)(t + 1); @@ -493,7 +495,28 @@ static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) static inline __u64 btf_enum64_value(const struct btf_enum64 *e) { - return ((__u64)e->val_hi32 << 32) | e->val_lo32; + /* struct btf_enum64 is introduced in Linux 6.0, which is very + * bleeding-edge. Here we are avoiding relying on struct btf_enum64 + * definition coming from kernel UAPI headers to support wider range + * of system-wide kernel headers. + * + * Given this header can be also included from C++ applications, that + * further restricts C tricks we can use (like using compatible + * anonymous struct).
So just treat struct btf_enum64 as + * a three-element array of u32 and access second (lo32) and third + * (hi32) elements directly. + * + * For reference, here is a struct btf_enum64 definition: + * + * const struct btf_enum64 { + * __u32 name_off; + * __u32 val_lo32; + * __u32 val_hi32; + * }; + */ + const __u32 *e64 = (const __u32 *)e; + + return ((__u64)e64[2] << 32) | e64[1]; } static inline struct btf_member *btf_members(const struct btf_type *t) From f0d74c4da1f060d2a66976193712a5e6abd361f5 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:53 -0700 Subject: [PATCH 108/143] bpf: Parameterize task iterators. Allow creating an iterator that loops through resources of one thread/process. People could only create iterators to loop through all resources of files, vma, and tasks in the system, even though they were interested in only the resources of a specific task or process. Passing the additional parameters, people can now create an iterator to go through all resources or only the resources of a task. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-2-kuifeng@fb.com --- include/linux/bpf.h | 25 +++++ include/uapi/linux/bpf.h | 6 ++ kernel/bpf/task_iter.c | 188 +++++++++++++++++++++++++++++---- tools/include/uapi/linux/bpf.h | 6 ++ 4 files changed, 203 insertions(+), 22 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5161fac0513f..0f3eaf3ed98c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1796,6 +1796,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +/* + * The task type of iterators. + * + * For BPF task iterators, they can be parameterized with various + * parameters to visit only some of tasks. + * + * BPF_TASK_ITER_ALL (default) + * Iterate over resources of every task. + * + * BPF_TASK_ITER_TID + * Iterate over resources of a task/tid. + * + * BPF_TASK_ITER_TGID + * Iterate over resources of every task of a process / task group. + */ +enum bpf_iter_task_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, + BPF_TASK_ITER_TGID, +}; + struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; @@ -1805,6 +1826,10 @@ struct bpf_iter_aux_info { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; + struct { + enum bpf_iter_task_type type; + u32 pid; + } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d6bd10759eaf..455b21a53aac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. 
*/ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def4..8b2f47e7139d 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -12,6 +12,9 @@ struct bpf_iter_seq_task_common { struct pid_namespace *ns; + enum bpf_iter_task_type type; + u32 pid; + u32 pid_visiting; }; struct bpf_iter_seq_task_info { @@ -22,18 +25,115 @@ struct bpf_iter_seq_task_info { u32 tid; }; -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, +static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common, + u32 *tid, + bool skip_if_dup_files) +{ + struct task_struct *task, *next_task; + struct pid *pid; + u32 saved_tid; + + if (!*tid) { + /* The first time, the iterator calls this function. */ + pid = find_pid_ns(common->pid, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_TGID); + if (!task) + return NULL; + + *tid = common->pid; + common->pid_visiting = common->pid; + + return task; + } + + /* If the control returns to user space and comes back to the + * kernel again, *tid and common->pid_visiting should be the + * same for task_seq_start() to pick up the correct task. + */ + if (*tid == common->pid_visiting) { + pid = find_pid_ns(common->pid_visiting, common->ns); + task = get_pid_task(pid, PIDTYPE_PID); + + return task; + } + + pid = find_pid_ns(common->pid_visiting, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return NULL; + +retry: + if (!pid_alive(task)) { + put_task_struct(task); + return NULL; + } + + next_task = next_thread(task); + put_task_struct(task); + if (!next_task) + return NULL; + + saved_tid = *tid; + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); + if (!*tid || *tid == common->pid) { + /* Run out of tasks of a process. The tasks of a + * thread_group are linked as circular linked list. 
+ */ + *tid = saved_tid; + return NULL; + } + + get_task_struct(next_task); + common->pid_visiting = *tid; + + if (skip_if_dup_files && task->files == task->group_leader->files) { + task = next_task; + goto retry; + } + + return next_task; +} + +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, u32 *tid, bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; + if (common->type == BPF_TASK_ITER_TID) { + if (*tid && *tid != common->pid) + return NULL; + rcu_read_lock(); + pid = find_pid_ns(common->pid, common->ns); + if (pid) { + task = get_pid_task(pid, PIDTYPE_TGID); + *tid = common->pid; + } + rcu_read_unlock(); + + return task; + } + + if (common->type == BPF_TASK_ITER_TGID) { + rcu_read_lock(); + task = task_group_seq_get_next(common, tid, skip_if_dup_files); + rcu_read_unlock(); + + return task; + } + rcu_read_lock(); retry: - pid = find_ge_pid(*tid, ns); + pid = find_ge_pid(*tid, common->ns); if (pid) { - *tid = pid_nr_ns(pid, ns); + *tid = pid_nr_ns(pid, common->ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; @@ -56,7 +156,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -73,7 +173,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -117,6 +217,41 @@ static void task_seq_stop(struct seq_file *seq, void *v) put_task_struct((struct task_struct *)v); } +static int bpf_iter_attach_task(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + unsigned int flags; + struct pid *pid; + pid_t tgid; + + if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1) + return -EINVAL; + + aux->task.type = BPF_TASK_ITER_ALL; + if (linfo->task.tid != 0) { + aux->task.type = BPF_TASK_ITER_TID; + aux->task.pid = linfo->task.tid; + } + if (linfo->task.pid != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + aux->task.pid = linfo->task.pid; + } + if (linfo->task.pid_fd != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + tgid = pid_nr_ns(pid, task_active_pid_ns(current)); + aux->task.pid = tgid; + put_pid(pid); + } + + return 0; +} + static const struct seq_operations task_seq_ops = { .start = task_seq_start, .next = task_seq_next, @@ -137,8 +272,7 @@ struct bpf_iter_seq_task_file_info { static struct file * task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { - struct pid_namespace *ns = info->common.ns; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; struct task_struct *curr_task; unsigned int curr_fd = info->fd; @@ -151,21 +285,18 @@ again: curr_task = info->task; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { info->task = NULL; - info->tid = curr_tid; return NULL; } - /* set info->task and info->tid */ + /* set info->task */ info->task = curr_task; - if (curr_tid == info->tid) { + if (saved_tid == info->tid) curr_fd = info->fd; - } else { - info->tid = curr_tid; + else 
curr_fd = 0; - } } rcu_read_lock(); @@ -186,9 +317,15 @@ again: /* the current task is done, go to the next task */ rcu_read_unlock(); put_task_struct(curr_task); + + if (info->common.type == BPF_TASK_ITER_TID) { + info->task = NULL; + return NULL; + } + info->task = NULL; info->fd = 0; - curr_tid = ++(info->tid); + saved_tid = ++(info->tid); goto again; } @@ -269,6 +406,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) struct bpf_iter_seq_task_common *common = priv_data; common->ns = get_pid_ns(task_active_pid_ns(current)); + common->type = aux->task.type; + common->pid = aux->task.pid; + return 0; } @@ -307,11 +447,10 @@ enum bpf_task_vma_iter_find_op { static struct vm_area_struct * task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) { - struct pid_namespace *ns = info->common.ns; enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to * the task_struct, and holds read lock on vma->mm->mmap_lock. @@ -371,14 +510,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) } } else { again: - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { - info->tid = curr_tid + 1; + info->tid++; goto finish; } - if (curr_tid != info->tid) { - info->tid = curr_tid; + if (saved_tid != info->tid) { /* new task, process the first vma */ op = task_vma_iter_first_vma; } else { @@ -430,9 +568,12 @@ again: return curr_vma; next_task: + if (info->common.type == BPF_TASK_ITER_TID) + goto finish; + put_task_struct(curr_task); info->task = NULL; - curr_tid++; + info->tid++; goto again; finish: @@ -533,6 +674,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { @@ -551,6 +693,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -571,6 +714,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = { static struct bpf_iter_reg task_vma_reg_info = { .target = "task_vma", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d6bd10759eaf..455b21a53aac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ From 21fb6f2aa3890b0d0abf88b7756d0098e9367a7c Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:54 -0700 Subject: [PATCH 109/143] bpf: Handle bpf_link_info for the parameterized task BPF iterators. Add new fields to bpf_link_info that users can query it through bpf_obj_get_info_by_fd(). 
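For illustration, a sketch (not from the patch) of how userspace could read these fields back; it assumes libbpf is available and that link_fd refers to a task iterator link, and dump_task_iter_params is a hypothetical helper:

  #include <stdio.h>
  #include <bpf/bpf.h>            /* bpf_obj_get_info_by_fd() */

  static void dump_task_iter_params(int link_fd)
  {
          struct bpf_link_info info = {};
          __u32 len = sizeof(info);

          if (bpf_obj_get_info_by_fd(link_fd, &info, &len))
                  return;

          if (info.iter.task.tid)
                  printf("visits one thread, tid %u\n", info.iter.task.tid);
          else if (info.iter.task.pid)
                  printf("visits one process, pid %u\n", info.iter.task.pid);
          else
                  printf("visits all tasks\n");
  }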
Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-3-kuifeng@fb.com --- include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/task_iter.c | 18 ++++++++++++++++++ tools/include/uapi/linux/bpf.h | 4 ++++ 3 files changed, 26 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 455b21a53aac..3075018a4ef8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6265,6 +6265,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 order; } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; }; } iter; struct { diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8b2f47e7139d..46f836be22e2 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -672,6 +672,21 @@ static const struct bpf_iter_seq_info task_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), }; +static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info) +{ + switch (aux->task.type) { + case BPF_TASK_ITER_TID: + info->iter.task.tid = aux->task.pid; + break; + case BPF_TASK_ITER_TGID: + info->iter.task.pid = aux->task.pid; + break; + default: + break; + } + return 0; +} + static struct bpf_iter_reg task_reg_info = { .target = "task", .attach_target = bpf_iter_attach_task, @@ -682,6 +697,7 @@ static struct bpf_iter_reg task_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_seq_info, + .fill_link_info = bpf_iter_fill_link_info, }; static const struct bpf_iter_seq_info task_file_seq_info = { @@ -703,6 +719,7 @@ static struct bpf_iter_reg task_file_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_file_seq_info, + .fill_link_info = bpf_iter_fill_link_info, }; static const struct bpf_iter_seq_info task_vma_seq_info = { @@ -724,6 +741,7 @@ static struct bpf_iter_reg task_vma_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_vma_seq_info, + .fill_link_info = bpf_iter_fill_link_info, }; BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 455b21a53aac..3075018a4ef8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6265,6 +6265,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 order; } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; }; } iter; struct { From 2c4fe44fb020f3cce904da2ba9e42bb1c118e8a3 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:55 -0700 Subject: [PATCH 110/143] bpf: Handle show_fdinfo for the parameterized task BPF iterators Show information of iterators in the respective files under /proc/<pid>/fdinfo/. For example, for a task file iterator with 1723 as the value of the tid parameter, its fdinfo would look like the following lines.

pos:	0
flags:	02000000
mnt_id:	14
ino:	38
link_type:	iter
link_id:	51
prog_tag:	a590ac96db22b825
prog_id:	299
target_name:	task_file
task_type:	TID
tid:	1723

This patch adds the last three fields. task_type is the type of the task parameter. TID means the iterator visits only the thread specified by tid. The value of tid in the above example is 1723. For the case of PID task_type, it means the iterator visits only threads of a process and will show the pid value of the process instead of a tid.
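A hedged sketch of dumping that fdinfo from the owning process itself (print_link_fdinfo is a hypothetical helper, not part of the patch):

  #include <stdio.h>

  static void print_link_fdinfo(int link_fd)
  {
          char path[64], line[256];
          FILE *f;

          snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", link_fd);
          f = fopen(path, "r");
          if (!f)
                  return;
          while (fgets(line, sizeof(line), f))
                  fputs(line, stdout);    /* includes task_type:/tid:/pid: lines */
          fclose(f);
  }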
Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-4-kuifeng@fb.com --- kernel/bpf/task_iter.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 46f836be22e2..67e03e1833ba 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -10,6 +10,12 @@ #include #include "mmap_unlock_work.h" +static const char * const iter_task_type_names[] = { + "ALL", + "TID", + "PID", +}; + struct bpf_iter_seq_task_common { struct pid_namespace *ns; enum bpf_iter_task_type type; @@ -687,6 +693,15 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b return 0; } +static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq) +{ + seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]); + if (aux->task.type == BPF_TASK_ITER_TID) + seq_printf(seq, "tid:\t%u\n", aux->task.pid); + else if (aux->task.type == BPF_TASK_ITER_TGID) + seq_printf(seq, "pid:\t%u\n", aux->task.pid); +} + static struct bpf_iter_reg task_reg_info = { .target = "task", .attach_target = bpf_iter_attach_task, @@ -698,6 +713,7 @@ static struct bpf_iter_reg task_reg_info = { }, .seq_info = &task_seq_info, .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; static const struct bpf_iter_seq_info task_file_seq_info = { @@ -720,6 +736,7 @@ static struct bpf_iter_reg task_file_reg_info = { }, .seq_info = &task_file_seq_info, .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; static const struct bpf_iter_seq_info task_vma_seq_info = { @@ -742,6 +759,7 @@ static struct bpf_iter_reg task_vma_reg_info = { }, .seq_info = &task_vma_seq_info, .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, From b3e1331eb925a45df1cc5d02a725e5ea70da0e2e Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:56 -0700 Subject: [PATCH 111/143] selftests/bpf: Test parameterized task BPF iterators. Test iterators of vma, files and tasks. Ensure the API works appropriately to visit all tasks, tasks in a process, or a particular task. 
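The attach pattern these tests exercise, in isolation (a sketch; it assumes prog is a loaded SEC("iter/task") program from one of the skeletons):

  #include <string.h>
  #include <unistd.h>
  #include <bpf/libbpf.h>

  static struct bpf_link *attach_tid_iter(struct bpf_program *prog)
  {
          LIBBPF_OPTS(bpf_iter_attach_opts, opts);
          union bpf_iter_link_info linfo;

          memset(&linfo, 0, sizeof(linfo));
          linfo.task.tid = getpid();  /* tid, pid and pid_fd are mutually exclusive */
          opts.link_info = &linfo;
          opts.link_info_len = sizeof(linfo);

          return bpf_program__attach_iter(prog, &opts);
  }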
Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-5-kuifeng@fb.com --- .../selftests/bpf/prog_tests/bpf_iter.c | 282 ++++++++++++++++-- .../selftests/bpf/prog_tests/btf_dump.c | 2 +- .../selftests/bpf/progs/bpf_iter_task.c | 9 + .../selftests/bpf/progs/bpf_iter_task_file.c | 9 +- .../selftests/bpf/progs/bpf_iter_task_vma.c | 7 +- .../selftests/bpf/progs/bpf_iter_vma_offset.c | 37 +++ 6 files changed, 322 insertions(+), 24 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index e89685bd587c..3369c5ec3a17 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ #include +#include +#include #include "bpf_iter_ipv6_route.skel.h" #include "bpf_iter_netlink.skel.h" #include "bpf_iter_bpf_map.skel.h" @@ -14,6 +16,7 @@ #include "bpf_iter_udp4.skel.h" #include "bpf_iter_udp6.skel.h" #include "bpf_iter_unix.skel.h" +#include "bpf_iter_vma_offset.skel.h" #include "bpf_iter_test_kern1.skel.h" #include "bpf_iter_test_kern2.skel.h" #include "bpf_iter_test_kern3.skel.h" @@ -43,13 +46,13 @@ static void test_btf_id_or_null(void) } } -static void do_dummy_read(struct bpf_program *prog) +static void do_dummy_read_opts(struct bpf_program *prog, struct bpf_iter_attach_opts *opts) { struct bpf_link *link; char buf[16] = {}; int iter_fd, len; - link = bpf_program__attach_iter(prog, NULL); + link = bpf_program__attach_iter(prog, opts); if (!ASSERT_OK_PTR(link, "attach_iter")) return; @@ -68,6 +71,11 @@ free_link: bpf_link__destroy(link); } +static void do_dummy_read(struct bpf_program *prog) +{ + do_dummy_read_opts(prog, NULL); +} + static void do_read_map_iter_fd(struct bpf_object_skeleton **skel, struct bpf_program *prog, struct bpf_map *map) { @@ -167,19 +175,140 @@ static void test_bpf_map(void) bpf_iter_bpf_map__destroy(skel); } -static void test_task(void) +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(SYS_pidfd_open, pid, flags); +} + +static void check_bpf_link_info(const struct bpf_program *prog) +{ + LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + struct bpf_link_info info = {}; + struct bpf_link *link; + __u32 info_len; + int err; + + memset(&linfo, 0, sizeof(linfo)); + linfo.task.tid = getpid(); + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(prog, &opts); + if (!ASSERT_OK_PTR(link, "attach_iter")) + return; + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len); + ASSERT_OK(err, "bpf_obj_get_info_by_fd"); + ASSERT_EQ(info.iter.task.tid, getpid(), "check_task_tid"); + + bpf_link__destroy(link); +} + +static pthread_mutex_t do_nothing_mutex; + +static void *do_nothing_wait(void *arg) +{ + pthread_mutex_lock(&do_nothing_mutex); + pthread_mutex_unlock(&do_nothing_mutex); + + pthread_exit(arg); +} + +static void test_task_common_nocheck(struct bpf_iter_attach_opts *opts, + int *num_unknown, int *num_known) { struct bpf_iter_task *skel; + pthread_t thread_id; + void *ret; skel = bpf_iter_task__open_and_load(); if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load")) return; - do_dummy_read(skel->progs.dump_task); + 
ASSERT_OK(pthread_mutex_lock(&do_nothing_mutex), "pthread_mutex_lock"); + + ASSERT_OK(pthread_create(&thread_id, NULL, &do_nothing_wait, NULL), + "pthread_create"); + + skel->bss->tid = getpid(); + + do_dummy_read_opts(skel->progs.dump_task, opts); + + *num_unknown = skel->bss->num_unknown_tid; + *num_known = skel->bss->num_known_tid; + + ASSERT_OK(pthread_mutex_unlock(&do_nothing_mutex), "pthread_mutex_unlock"); + ASSERT_FALSE(pthread_join(thread_id, &ret) || ret != NULL, + "pthread_join"); bpf_iter_task__destroy(skel); } +static void test_task_common(struct bpf_iter_attach_opts *opts, int num_unknown, int num_known) +{ + int num_unknown_tid, num_known_tid; + + test_task_common_nocheck(opts, &num_unknown_tid, &num_known_tid); + ASSERT_EQ(num_unknown_tid, num_unknown, "check_num_unknown_tid"); + ASSERT_EQ(num_known_tid, num_known, "check_num_known_tid"); +} + +static void test_task_tid(void) +{ + LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + int num_unknown_tid, num_known_tid; + + memset(&linfo, 0, sizeof(linfo)); + linfo.task.tid = getpid(); + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + test_task_common(&opts, 0, 1); + + linfo.task.tid = 0; + linfo.task.pid = getpid(); + test_task_common(&opts, 1, 1); + + test_task_common_nocheck(NULL, &num_unknown_tid, &num_known_tid); + ASSERT_GT(num_unknown_tid, 1, "check_num_unknown_tid"); + ASSERT_EQ(num_known_tid, 1, "check_num_known_tid"); +} + +static void test_task_pid(void) +{ + LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + + memset(&linfo, 0, sizeof(linfo)); + linfo.task.pid = getpid(); + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + test_task_common(&opts, 1, 1); +} + +static void test_task_pidfd(void) +{ + LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + int pidfd; + + pidfd = pidfd_open(getpid(), 0); + if (!ASSERT_GT(pidfd, 0, "pidfd_open")) + return; + + memset(&linfo, 0, sizeof(linfo)); + linfo.task.pid_fd = pidfd; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + test_task_common(&opts, 1, 1); + + close(pidfd); +} + static void test_task_sleepable(void) { struct bpf_iter_task *skel; @@ -212,14 +341,11 @@ static void test_task_stack(void) bpf_iter_task_stack__destroy(skel); } -static void *do_nothing(void *arg) -{ - pthread_exit(arg); -} - static void test_task_file(void) { + LIBBPF_OPTS(bpf_iter_attach_opts, opts); struct bpf_iter_task_file *skel; + union bpf_iter_link_info linfo; pthread_t thread_id; void *ret; @@ -229,19 +355,36 @@ static void test_task_file(void) skel->bss->tgid = getpid(); - if (!ASSERT_OK(pthread_create(&thread_id, NULL, &do_nothing, NULL), - "pthread_create")) - goto done; + ASSERT_OK(pthread_mutex_lock(&do_nothing_mutex), "pthread_mutex_lock"); + + ASSERT_OK(pthread_create(&thread_id, NULL, &do_nothing_wait, NULL), + "pthread_create"); + + memset(&linfo, 0, sizeof(linfo)); + linfo.task.tid = getpid(); + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + do_dummy_read_opts(skel->progs.dump_task_file, &opts); + + ASSERT_EQ(skel->bss->count, 0, "check_count"); + ASSERT_EQ(skel->bss->unique_tgid_count, 1, "check_unique_tgid_count"); + + skel->bss->last_tgid = 0; + skel->bss->count = 0; + skel->bss->unique_tgid_count = 0; do_dummy_read(skel->progs.dump_task_file); - if (!ASSERT_FALSE(pthread_join(thread_id, &ret) || ret != NULL, - "pthread_join")) - goto done; - ASSERT_EQ(skel->bss->count, 0, "check_count"); + ASSERT_GT(skel->bss->unique_tgid_count, 1, 
"check_unique_tgid_count"); + + check_bpf_link_info(skel->progs.dump_task_file); + + ASSERT_OK(pthread_mutex_unlock(&do_nothing_mutex), "pthread_mutex_unlock"); + ASSERT_OK(pthread_join(thread_id, &ret), "pthread_join"); + ASSERT_NULL(ret, "pthread_join"); -done: bpf_iter_task_file__destroy(skel); } @@ -1249,7 +1392,7 @@ static void str_strip_first_line(char *str) *dst = '\0'; } -static void test_task_vma(void) +static void test_task_vma_common(struct bpf_iter_attach_opts *opts) { int err, iter_fd = -1, proc_maps_fd = -1; struct bpf_iter_task_vma *skel; @@ -1261,13 +1404,14 @@ static void test_task_vma(void) return; skel->bss->pid = getpid(); + skel->bss->one_task = opts ? 1 : 0; err = bpf_iter_task_vma__load(skel); if (!ASSERT_OK(err, "bpf_iter_task_vma__load")) goto out; skel->links.proc_maps = bpf_program__attach_iter( - skel->progs.proc_maps, NULL); + skel->progs.proc_maps, opts); if (!ASSERT_OK_PTR(skel->links.proc_maps, "bpf_program__attach_iter")) { skel->links.proc_maps = NULL; @@ -1291,6 +1435,8 @@ static void test_task_vma(void) goto out; len += err; } + if (opts) + ASSERT_EQ(skel->bss->one_task_error, 0, "unexpected task"); /* read CMP_BUFFER_SIZE (1kB) from /proc/pid/maps */ snprintf(maps_path, 64, "/proc/%u/maps", skel->bss->pid); @@ -1306,6 +1452,9 @@ static void test_task_vma(void) str_strip_first_line(proc_maps_output); ASSERT_STREQ(task_vma_output, proc_maps_output, "compare_output"); + + check_bpf_link_info(skel->progs.proc_maps); + out: close(proc_maps_fd); close(iter_fd); @@ -1325,8 +1474,93 @@ void test_bpf_sockmap_map_iter_fd(void) bpf_iter_sockmap__destroy(skel); } +static void test_task_vma(void) +{ + LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + + memset(&linfo, 0, sizeof(linfo)); + linfo.task.tid = getpid(); + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + test_task_vma_common(&opts); + test_task_vma_common(NULL); +} + +/* uprobe attach point */ +static noinline int trigger_func(int arg) +{ + asm volatile (""); + return arg + 1; +} + +static void test_task_vma_offset_common(struct bpf_iter_attach_opts *opts, bool one_proc) +{ + struct bpf_iter_vma_offset *skel; + struct bpf_link *link; + char buf[16] = {}; + int iter_fd, len; + int pgsz, shift; + + skel = bpf_iter_vma_offset__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_vma_offset__open_and_load")) + return; + + skel->bss->pid = getpid(); + skel->bss->address = (uintptr_t)trigger_func; + for (pgsz = getpagesize(), shift = 0; pgsz > 1; pgsz >>= 1, shift++) + ; + skel->bss->page_shift = shift; + + link = bpf_program__attach_iter(skel->progs.get_vma_offset, opts); + if (!ASSERT_OK_PTR(link, "attach_iter")) + return; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GT(iter_fd, 0, "create_iter")) + goto exit; + + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + buf[15] = 0; + ASSERT_EQ(strcmp(buf, "OK\n"), 0, "strcmp"); + + ASSERT_EQ(skel->bss->offset, get_uprobe_offset(trigger_func), "offset"); + if (one_proc) + ASSERT_EQ(skel->bss->unique_tgid_cnt, 1, "unique_tgid_count"); + else + ASSERT_GT(skel->bss->unique_tgid_cnt, 1, "unique_tgid_count"); + + close(iter_fd); + +exit: + bpf_link__destroy(link); +} + +static void test_task_vma_offset(void) +{ + LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + + memset(&linfo, 0, sizeof(linfo)); + linfo.task.pid = getpid(); + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + test_task_vma_offset_common(&opts, true); + + linfo.task.pid = 0; + 
linfo.task.tid = getpid(); + test_task_vma_offset_common(&opts, true); + + test_task_vma_offset_common(NULL, false); +} + void test_bpf_iter(void) { + ASSERT_OK(pthread_mutex_init(&do_nothing_mutex, NULL), "pthread_mutex_init"); + if (test__start_subtest("btf_id_or_null")) test_btf_id_or_null(); if (test__start_subtest("ipv6_route")) @@ -1335,8 +1569,12 @@ void test_bpf_iter(void) test_netlink(); if (test__start_subtest("bpf_map")) test_bpf_map(); - if (test__start_subtest("task")) - test_task(); + if (test__start_subtest("task_tid")) + test_task_tid(); + if (test__start_subtest("task_pid")) + test_task_pid(); + if (test__start_subtest("task_pidfd")) + test_task_pidfd(); if (test__start_subtest("task_sleepable")) test_task_sleepable(); if (test__start_subtest("task_stack")) @@ -1397,4 +1635,6 @@ void test_bpf_iter(void) test_ksym_iter(); if (test__start_subtest("bpf_sockmap_map_iter_fd")) test_bpf_sockmap_map_iter_fd(); + if (test__start_subtest("vma_offset")) + test_task_vma_offset(); } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index b1ca954ed1e5..24da335482d4 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -764,7 +764,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, /* union with nested struct */ TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT, - "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_CGROUP_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}", + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_CGROUP_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},.task = (struct){.tid = (__u32)1,.pid = (__u32)1,},}", { .cgroup = { .order = 1, .cgroup_fd = 1, }}); /* struct skb with nested structs/unions; because type output is so diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c index d22741272692..96131b9a1caa 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c @@ -6,6 +6,10 @@ char _license[] SEC("license") = "GPL"; +uint32_t tid = 0; +int num_unknown_tid = 0; +int num_known_tid = 0; + SEC("iter/task") int dump_task(struct bpf_iter__task *ctx) { @@ -18,6 +22,11 @@ int dump_task(struct bpf_iter__task *ctx) return 0; } + if (task->pid != tid) + num_unknown_tid++; + else + num_known_tid++; + if (ctx->meta->seq_num == 0) BPF_SEQ_PRINTF(seq, " tgid gid\n"); diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c index 6e7b400888fe..b0255080662d 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c @@ -7,14 +7,16 @@ char _license[] SEC("license") = "GPL"; int count = 0; int tgid = 0; +int last_tgid = 0; +int unique_tgid_count = 0; SEC("iter/task_file") int dump_task_file(struct bpf_iter__task_file *ctx) { struct seq_file *seq = ctx->meta->seq; struct task_struct *task = ctx->task; - __u32 fd = ctx->fd; struct file *file = ctx->file; + __u32 fd = ctx->fd; if (task == (void *)0 || file == (void *)0) return 0; @@ -27,6 +29,11 @@ int dump_task_file(struct bpf_iter__task_file *ctx) if (tgid == task->tgid && task->tgid != task->pid) count++; + if (last_tgid != task->tgid) { + last_tgid = 
task->tgid; + unique_tgid_count++; + } + BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd, (long)file->f_op); return 0; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c index 4ea6a37d1345..dd923dc637d5 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c @@ -20,6 +20,8 @@ char _license[] SEC("license") = "GPL"; #define D_PATH_BUF_SIZE 1024 char d_path_buf[D_PATH_BUF_SIZE] = {}; __u32 pid = 0; +__u32 one_task = 0; +__u32 one_task_error = 0; SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx) { @@ -33,8 +35,11 @@ SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx) return 0; file = vma->vm_file; - if (task->tgid != pid) + if (task->tgid != pid) { + if (one_task) + one_task_error = 1; return 0; + } perm_str[0] = (vma->vm_flags & VM_READ) ? 'r' : '-'; perm_str[1] = (vma->vm_flags & VM_WRITE) ? 'w' : '-'; perm_str[2] = (vma->vm_flags & VM_EXEC) ? 'x' : '-'; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c b/tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c new file mode 100644 index 000000000000..ee7455d2623a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include "bpf_iter.h" +#include + +char _license[] SEC("license") = "GPL"; + +__u32 unique_tgid_cnt = 0; +uintptr_t address = 0; +uintptr_t offset = 0; +__u32 last_tgid = 0; +__u32 pid = 0; +__u32 page_shift = 0; + +SEC("iter/task_vma") +int get_vma_offset(struct bpf_iter__task_vma *ctx) +{ + struct vm_area_struct *vma = ctx->vma; + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + + if (task == NULL || vma == NULL) + return 0; + + if (last_tgid != task->tgid) + unique_tgid_cnt++; + last_tgid = task->tgid; + + if (task->tgid != pid) + return 0; + + if (vma->vm_start <= address && vma->vm_end > address) { + offset = address - vma->vm_start + (vma->vm_pgoff << page_shift); + BPF_SEQ_PRINTF(seq, "OK\n"); + } + return 0; +} From 6bdb6d6be019f697296f52c37865dd7b0ce80750 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:57 -0700 Subject: [PATCH 112/143] bpftool: Show parameters of BPF task iterators. Show tid or pid of iterators if giving an argument of tid or pid For example, the command `bpftool link list` may list following lines. 1: iter prog 2 target_name bpf_map 2: iter prog 3 target_name bpf_prog 33: iter prog 225 target_name task_file tid 1644 pids test_progs(1644) Link 33 is a task_file iterator with tid 1644. For now, only targets of task, task_file and task_vma may be with tid or pid to filter out tasks other than those belonging to a process (pid) or a thread (tid). 
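With JSON output (bpftool -j link list), the same parameters land in "tid" or "pid" fields next to "target_name". An illustrative, hand-written rendering of link 33 above (exact field set and order assumed):

  {"id":33,"type":"iter","prog_id":225,"target_name":"task_file","tid":1644}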
Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-6-kuifeng@fb.com --- tools/bpf/bpftool/link.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index ef0dc2f8d5a2..2863639706dd 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -106,6 +106,13 @@ static const char *cgroup_order_string(__u32 order) } } +static bool is_iter_task_target(const char *target_name) +{ + return strcmp(target_name, "task") == 0 || + strcmp(target_name, "task_file") == 0 || + strcmp(target_name, "task_vma") == 0; +} + static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr) { const char *target_name = u64_to_ptr(info->iter.target_name); @@ -114,6 +121,12 @@ static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr) if (is_iter_map_target(target_name)) jsonw_uint_field(wtr, "map_id", info->iter.map.map_id); + else if (is_iter_task_target(target_name)) { + if (info->iter.task.tid) + jsonw_uint_field(wtr, "tid", info->iter.task.tid); + else if (info->iter.task.pid) + jsonw_uint_field(wtr, "pid", info->iter.task.pid); + } if (is_iter_cgroup_target(target_name)) { jsonw_lluint_field(wtr, "cgroup_id", info->iter.cgroup.cgroup_id); @@ -237,6 +250,12 @@ static void show_iter_plain(struct bpf_link_info *info) if (is_iter_map_target(target_name)) printf("map_id %u ", info->iter.map.map_id); + else if (is_iter_task_target(target_name)) { + if (info->iter.task.tid) + printf("tid %u ", info->iter.task.tid); + else if (info->iter.task.pid) + printf("pid %u ", info->iter.task.pid); + } if (is_iter_cgroup_target(target_name)) { printf("cgroup_id %llu ", info->iter.cgroup.cgroup_id); From 38e35e1d0cee3432baadfd6900e1d05a3419eda6 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 26 Sep 2022 13:12:01 +0800 Subject: [PATCH 113/143] selftests/bpf: Convert sockmap_basic test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). 
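The shape of the conversion, distilled (both macro families come from the selftests' test_progs.h; the socket call is just an illustrative failure source):

  int duration = 0;       /* CHECK() needs this in scope */
  int s = socket(AF_INET, SOCK_STREAM, 0);

  /* old style: true on failure, caller writes the error message */
  if (CHECK(s == -1, "socket", "failed: %s\n", strerror(errno)))
          return;

  /* new style: true on success, just the value(s) and a short tag */
  if (!ASSERT_GE(s, 0, "socket"))
          return;

This is also why the hunks below drop now-unused duration locals.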
Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-2-git-send-email-wangyufen@huawei.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 87 +++++++------------ 1 file changed, 33 insertions(+), 54 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index cec5c0882372..0aa088900699 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -27,21 +27,21 @@ static int connected_socket_v4(void) int s, repair, err; s = socket(AF_INET, SOCK_STREAM, 0); - if (CHECK_FAIL(s == -1)) + if (!ASSERT_GE(s, 0, "socket")) goto error; repair = TCP_REPAIR_ON; err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair)); - if (CHECK_FAIL(err)) + if (!ASSERT_OK(err, "setsockopt(TCP_REPAIR)")) goto error; err = connect(s, (struct sockaddr *)&addr, len); - if (CHECK_FAIL(err)) + if (!ASSERT_OK(err, "connect")) goto error; repair = TCP_REPAIR_OFF_NO_WP; err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair)); - if (CHECK_FAIL(err)) + if (!ASSERT_OK(err, "setsockopt(TCP_REPAIR)")) goto error; return s; @@ -54,7 +54,7 @@ error: static void compare_cookies(struct bpf_map *src, struct bpf_map *dst) { __u32 i, max_entries = bpf_map__max_entries(src); - int err, duration = 0, src_fd, dst_fd; + int err, src_fd, dst_fd; src_fd = bpf_map__fd(src); dst_fd = bpf_map__fd(dst); @@ -65,20 +65,18 @@ static void compare_cookies(struct bpf_map *src, struct bpf_map *dst) err = bpf_map_lookup_elem(src_fd, &i, &src_cookie); if (err && errno == ENOENT) { err = bpf_map_lookup_elem(dst_fd, &i, &dst_cookie); - CHECK(!err, "map_lookup_elem(dst)", "element %u not deleted\n", i); - CHECK(err && errno != ENOENT, "map_lookup_elem(dst)", "%s\n", - strerror(errno)); + ASSERT_ERR(err, "map_lookup_elem(dst)"); + ASSERT_EQ(errno, ENOENT, "map_lookup_elem(dst)"); continue; } - if (CHECK(err, "lookup_elem(src)", "%s\n", strerror(errno))) + if (!ASSERT_OK(err, "lookup_elem(src)")) continue; err = bpf_map_lookup_elem(dst_fd, &i, &dst_cookie); - if (CHECK(err, "lookup_elem(dst)", "%s\n", strerror(errno))) + if (!ASSERT_OK(err, "lookup_elem(dst)")) continue; - CHECK(dst_cookie != src_cookie, "cookie mismatch", - "%llu != %llu (pos %u)\n", dst_cookie, src_cookie, i); + ASSERT_EQ(dst_cookie, src_cookie, "cookie mismatch"); } } @@ -89,20 +87,16 @@ static void test_sockmap_create_update_free(enum bpf_map_type map_type) int s, map, err; s = connected_socket_v4(); - if (CHECK_FAIL(s < 0)) + if (!ASSERT_GE(s, 0, "connected_socket_v4")) return; map = bpf_map_create(map_type, NULL, sizeof(int), sizeof(int), 1, NULL); - if (CHECK_FAIL(map < 0)) { - perror("bpf_cmap_create"); + if (!ASSERT_GE(map, 0, "bpf_map_create")) goto out; - } err = bpf_map_update_elem(map, &zero, &s, BPF_NOEXIST); - if (CHECK_FAIL(err)) { - perror("bpf_map_update"); + if (!ASSERT_OK(err, "bpf_map_update")) goto out; - } out: close(map); @@ -115,32 +109,26 @@ static void test_skmsg_helpers(enum bpf_map_type map_type) int err, map, verdict; skel = test_skmsg_load_helpers__open_and_load(); - if (CHECK_FAIL(!skel)) { - perror("test_skmsg_load_helpers__open_and_load"); + if (!ASSERT_OK_PTR(skel, "test_skmsg_load_helpers__open_and_load")) return; - } verdict = bpf_program__fd(skel->progs.prog_msg_verdict); map = bpf_map__fd(skel->maps.sock_map); err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); - if (CHECK_FAIL(err)) { - perror("bpf_prog_attach"); + if 
(!ASSERT_OK(err, "bpf_prog_attach")) goto out; - } err = bpf_prog_detach2(verdict, map, BPF_SK_MSG_VERDICT); - if (CHECK_FAIL(err)) { - perror("bpf_prog_detach2"); + if (!ASSERT_OK(err, "bpf_prog_detach2")) goto out; - } out: test_skmsg_load_helpers__destroy(skel); } static void test_sockmap_update(enum bpf_map_type map_type) { - int err, prog, src, duration = 0; + int err, prog, src; struct test_sockmap_update *skel; struct bpf_map *dst_map; const __u32 zero = 0; @@ -153,11 +141,11 @@ static void test_sockmap_update(enum bpf_map_type map_type) __s64 sk; sk = connected_socket_v4(); - if (CHECK(sk == -1, "connected_socket_v4", "cannot connect\n")) + if (!ASSERT_NEQ(sk, -1, "connected_socket_v4")) return; skel = test_sockmap_update__open_and_load(); - if (CHECK(!skel, "open_and_load", "cannot load skeleton\n")) + if (!ASSERT_OK_PTR(skel, "open_and_load")) goto close_sk; prog = bpf_program__fd(skel->progs.copy_sock_map); @@ -168,7 +156,7 @@ static void test_sockmap_update(enum bpf_map_type map_type) dst_map = skel->maps.dst_sock_hash; err = bpf_map_update_elem(src, &zero, &sk, BPF_NOEXIST); - if (CHECK(err, "update_elem(src)", "errno=%u\n", errno)) + if (!ASSERT_OK(err, "update_elem(src)")) goto out; err = bpf_prog_test_run_opts(prog, &topts); @@ -188,17 +176,16 @@ close_sk: static void test_sockmap_invalid_update(void) { struct test_sockmap_invalid_update *skel; - int duration = 0; skel = test_sockmap_invalid_update__open_and_load(); - if (CHECK(skel, "open_and_load", "verifier accepted map_update\n")) + if (!ASSERT_NULL(skel, "open_and_load")) test_sockmap_invalid_update__destroy(skel); } static void test_sockmap_copy(enum bpf_map_type map_type) { DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); - int err, len, src_fd, iter_fd, duration = 0; + int err, len, src_fd, iter_fd; union bpf_iter_link_info linfo = {}; __u32 i, num_sockets, num_elems; struct bpf_iter_sockmap *skel; @@ -208,7 +195,7 @@ static void test_sockmap_copy(enum bpf_map_type map_type) char buf[64]; skel = bpf_iter_sockmap__open_and_load(); - if (CHECK(!skel, "bpf_iter_sockmap__open_and_load", "skeleton open_and_load failed\n")) + if (!ASSERT_OK_PTR(skel, "bpf_iter_sockmap__open_and_load")) return; if (map_type == BPF_MAP_TYPE_SOCKMAP) { @@ -222,7 +209,7 @@ static void test_sockmap_copy(enum bpf_map_type map_type) } sock_fd = calloc(num_sockets, sizeof(*sock_fd)); - if (CHECK(!sock_fd, "calloc(sock_fd)", "failed to allocate\n")) + if (!ASSERT_OK_PTR(sock_fd, "calloc(sock_fd)")) goto out; for (i = 0; i < num_sockets; i++) @@ -232,11 +219,11 @@ static void test_sockmap_copy(enum bpf_map_type map_type) for (i = 0; i < num_sockets; i++) { sock_fd[i] = connected_socket_v4(); - if (CHECK(sock_fd[i] == -1, "connected_socket_v4", "cannot connect\n")) + if (!ASSERT_NEQ(sock_fd[i], -1, "connected_socket_v4")) goto out; err = bpf_map_update_elem(src_fd, &i, &sock_fd[i], BPF_NOEXIST); - if (CHECK(err, "map_update", "failed: %s\n", strerror(errno))) + if (!ASSERT_OK(err, "map_update")) goto out; } @@ -248,22 +235,20 @@ static void test_sockmap_copy(enum bpf_map_type map_type) goto out; iter_fd = bpf_iter_create(bpf_link__fd(link)); - if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + if (!ASSERT_GE(iter_fd, 0, "create_iter")) goto free_link; /* do some tests */ while ((len = read(iter_fd, buf, sizeof(buf))) > 0) ; - if (CHECK(len < 0, "read", "failed: %s\n", strerror(errno))) + if (!ASSERT_GE(len, 0, "read")) goto close_iter; /* test results */ - if (CHECK(skel->bss->elems != num_elems, "elems", "got %u expected %u\n", - 
skel->bss->elems, num_elems)) + if (!ASSERT_EQ(skel->bss->elems, num_elems, "elems")) goto close_iter; - if (CHECK(skel->bss->socks != num_sockets, "socks", "got %u expected %u\n", - skel->bss->socks, num_sockets)) + if (!ASSERT_EQ(skel->bss->socks, num_sockets, "socks")) goto close_iter; compare_cookies(src, skel->maps.dst); @@ -288,28 +273,22 @@ static void test_sockmap_skb_verdict_attach(enum bpf_attach_type first, int err, map, verdict; skel = test_sockmap_skb_verdict_attach__open_and_load(); - if (CHECK_FAIL(!skel)) { - perror("test_sockmap_skb_verdict_attach__open_and_load"); + if (!ASSERT_OK_PTR(skel, "open_and_load")) return; - } verdict = bpf_program__fd(skel->progs.prog_skb_verdict); map = bpf_map__fd(skel->maps.sock_map); err = bpf_prog_attach(verdict, map, first, 0); - if (CHECK_FAIL(err)) { - perror("bpf_prog_attach"); + if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - } err = bpf_prog_attach(verdict, map, second, 0); ASSERT_EQ(err, -EBUSY, "prog_attach_fail"); err = bpf_prog_detach2(verdict, map, first); - if (CHECK_FAIL(err)) { - perror("bpf_prog_detach2"); + if (!ASSERT_OK(err, "bpf_prog_detach2")) goto out; - } out: test_sockmap_skb_verdict_attach__destroy(skel); } From d155fcb3fff16410ccd7583f9a16c15ddffeca1e Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 26 Sep 2022 13:12:02 +0800 Subject: [PATCH 114/143] selftests/bpf: Convert sockmap_ktls test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-3-git-send-email-wangyufen@huawei.com --- .../selftests/bpf/prog_tests/sockmap_ktls.c | 39 +++++-------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c index e172d89e92e1..2d0796314862 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c @@ -15,16 +15,12 @@ static int tcp_server(int family) int err, s; s = socket(family, SOCK_STREAM, 0); - if (CHECK_FAIL(s == -1)) { - perror("socket"); + if (!ASSERT_GE(s, 0, "socket")) return -1; - } err = listen(s, SOMAXCONN); - if (CHECK_FAIL(err)) { - perror("listen"); + if (!ASSERT_OK(err, "listen")) return -1; - } return s; } @@ -48,44 +44,31 @@ static void test_sockmap_ktls_disconnect_after_delete(int family, int map) return; err = getsockname(srv, (struct sockaddr *)&addr, &len); - if (CHECK_FAIL(err)) { - perror("getsockopt"); + if (!ASSERT_OK(err, "getsockopt")) goto close_srv; - } cli = socket(family, SOCK_STREAM, 0); - if (CHECK_FAIL(cli == -1)) { - perror("socket"); + if (!ASSERT_GE(cli, 0, "socket")) goto close_srv; - } err = connect(cli, (struct sockaddr *)&addr, len); - if (CHECK_FAIL(err)) { - perror("connect"); + if (!ASSERT_OK(err, "connect")) goto close_cli; - } err = bpf_map_update_elem(map, &zero, &cli, 0); - if (CHECK_FAIL(err)) { - perror("bpf_map_update_elem"); + if (!ASSERT_OK(err, "bpf_map_update_elem")) goto close_cli; - } err = setsockopt(cli, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls")); - if (CHECK_FAIL(err)) { - perror("setsockopt(TCP_ULP)"); + if (!ASSERT_OK(err, "setsockopt(TCP_ULP)")) goto close_cli; - } err = bpf_map_delete_elem(map, &zero); - if (CHECK_FAIL(err)) { - perror("bpf_map_delete_elem"); + if (!ASSERT_OK(err, "bpf_map_delete_elem")) goto close_cli; - } err = disconnect(cli); - if (CHECK_FAIL(err)) - perror("disconnect"); + 
ASSERT_OK(err, "disconnect"); close_cli: close(cli); @@ -168,10 +151,8 @@ static void run_tests(int family, enum bpf_map_type map_type) int map; map = bpf_map_create(map_type, NULL, sizeof(int), sizeof(int), 1, NULL); - if (CHECK_FAIL(map < 0)) { - perror("bpf_map_create"); + if (!ASSERT_GE(map, 0, "bpf_map_create")) return; - } if (test__start_subtest(fmt_test_name("disconnect_after_delete", family, map_type))) test_sockmap_ktls_disconnect_after_delete(family, map); From 099763e7da0beec120827547f227c123e9d4a155 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 26 Sep 2022 13:12:03 +0800 Subject: [PATCH 115/143] selftests/bpf: Convert sockopt test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-4-git-send-email-wangyufen@huawei.com --- tools/testing/selftests/bpf/prog_tests/sockopt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c index cd09f4c7dd92..aa4debf62fc6 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c @@ -972,12 +972,12 @@ void test_sockopt(void) int cgroup_fd, i; cgroup_fd = test__join_cgroup("/sockopt"); - if (CHECK_FAIL(cgroup_fd < 0)) + if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup")) return; for (i = 0; i < ARRAY_SIZE(tests); i++) { test__start_subtest(tests[i].descr); - CHECK_FAIL(run_test(cgroup_fd, &tests[i])); + ASSERT_OK(run_test(cgroup_fd, &tests[i]), tests[i].descr); } close(cgroup_fd); From 675bc8abe16d9ce97970e8a781e9e72bb8d47ca2 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 26 Sep 2022 13:12:04 +0800 Subject: [PATCH 116/143] selftests/bpf: Convert sockopt_inherit test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). 
Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-5-git-send-email-wangyufen@huawei.com --- .../bpf/prog_tests/sockopt_inherit.c | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index c5cb6e8374b6..60c17a8e2789 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -76,20 +76,16 @@ static void *server_thread(void *arg) pthread_cond_signal(&server_started); pthread_mutex_unlock(&server_started_mtx); - if (CHECK_FAIL(err < 0)) { - perror("Failed to listed on socket"); + if (!ASSERT_GE(err, 0, "listen on socket")) return NULL; - } err += verify_sockopt(fd, CUSTOM_INHERIT1, "listen", 1); err += verify_sockopt(fd, CUSTOM_INHERIT2, "listen", 1); err += verify_sockopt(fd, CUSTOM_LISTENER, "listen", 1); client_fd = accept(fd, (struct sockaddr *)&addr, &len); - if (CHECK_FAIL(client_fd < 0)) { - perror("Failed to accept client"); + if (!ASSERT_GE(client_fd, 0, "accept client")) return NULL; - } err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "accept", 1); err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "accept", 1); @@ -183,20 +179,20 @@ static void run_test(int cgroup_fd) goto close_bpf_object; err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt", "_getsockopt"); - if (CHECK_FAIL(err)) + if (!ASSERT_OK(err, "prog_attach _getsockopt")) goto close_bpf_object; err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt", "_setsockopt"); - if (CHECK_FAIL(err)) + if (!ASSERT_OK(err, "prog_attach _setsockopt")) goto close_bpf_object; server_fd = start_server(); - if (CHECK_FAIL(server_fd < 0)) + if (!ASSERT_GE(server_fd, 0, "start_server")) goto close_bpf_object; pthread_mutex_lock(&server_started_mtx); - if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread, - (void *)&server_fd))) { + if (!ASSERT_OK(pthread_create(&tid, NULL, server_thread, + (void *)&server_fd), "pthread_create")) { pthread_mutex_unlock(&server_started_mtx); goto close_server_fd; } @@ -204,17 +200,17 @@ static void run_test(int cgroup_fd) pthread_mutex_unlock(&server_started_mtx); client_fd = connect_to_server(server_fd); - if (CHECK_FAIL(client_fd < 0)) + if (!ASSERT_GE(client_fd, 0, "connect_to_server")) goto close_server_fd; - CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0)); - CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0)); - CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0)); + ASSERT_OK(verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0), "verify_sockopt1"); + ASSERT_OK(verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0), "verify_sockopt2"); + ASSERT_OK(verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0), "verify_sockopt_listener"); pthread_join(tid, &server_err); err = (int)(long)server_err; - CHECK_FAIL(err); + ASSERT_OK(err, "pthread_join retval"); close(client_fd); @@ -229,7 +225,7 @@ void test_sockopt_inherit(void) int cgroup_fd; cgroup_fd = test__join_cgroup("/sockopt_inherit"); - if (CHECK_FAIL(cgroup_fd < 0)) + if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup")) return; run_test(cgroup_fd); From a605a6bbccceebbb68ab9f8ff2b27e2faa38525d Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 26 Sep 2022 13:12:05 +0800 Subject: [PATCH 117/143] selftests/bpf: Convert sockopt_multi test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of
the deprecated CHECK(). Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-6-git-send-email-wangyufen@huawei.com --- tools/testing/selftests/bpf/prog_tests/sockopt_multi.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c index 28d592dc54a7..7f5659349011 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c @@ -303,11 +303,11 @@ void test_sockopt_multi(void) int err = -1; cg_parent = test__join_cgroup("/parent"); - if (CHECK_FAIL(cg_parent < 0)) + if (!ASSERT_GE(cg_parent, 0, "join_cgroup /parent")) goto out; cg_child = test__join_cgroup("/parent/child"); - if (CHECK_FAIL(cg_child < 0)) + if (!ASSERT_GE(cg_child, 0, "join_cgroup /parent/child")) goto out; obj = bpf_object__open_file("sockopt_multi.bpf.o", NULL); @@ -319,11 +319,11 @@ void test_sockopt_multi(void) goto out; sock_fd = socket(AF_INET, SOCK_STREAM, 0); - if (CHECK_FAIL(sock_fd < 0)) + if (!ASSERT_GE(sock_fd, 0, "socket")) goto out; - CHECK_FAIL(run_getsockopt_test(obj, cg_parent, cg_child, sock_fd)); - CHECK_FAIL(run_setsockopt_test(obj, cg_parent, cg_child, sock_fd)); + ASSERT_OK(run_getsockopt_test(obj, cg_parent, cg_child, sock_fd), "getsockopt_test"); + ASSERT_OK(run_setsockopt_test(obj, cg_parent, cg_child, sock_fd), "setsockopt_test"); out: close(sock_fd); From f19708dfa0bf5a016f27e92ef4d3514788f6dc8b Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 26 Sep 2022 13:12:06 +0800 Subject: [PATCH 118/143] selftests/bpf: Convert sockopt_sk test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-7-git-send-email-wangyufen@huawei.com --- tools/testing/selftests/bpf/prog_tests/sockopt_sk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index 30a99d2ed5c6..60d952719d27 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -223,7 +223,7 @@ void test_sockopt_sk(void) int cgroup_fd; cgroup_fd = test__join_cgroup("/sockopt_sk"); - if (CHECK_FAIL(cgroup_fd < 0)) + if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /sockopt_sk")) return; run_test(cgroup_fd); From a0a17296713aea7b3cbc94662c6ffb53a79a3f2c Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 26 Sep 2022 13:12:07 +0800 Subject: [PATCH 119/143] selftests/bpf: Convert tcp_estats test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). 
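One detail to keep in mind when reviewing these conversions (an illustrative note, not taken from the patches themselves): every ASSERT_* macro in test_progs.h evaluates to true when the check passes, so error paths must negate the result. Dropping the negation silently inverts the control flow:

	/* Correct: take the error path only when the assertion fails. */
	if (!ASSERT_OK(err, "bpf_prog_attach"))
		goto out;

	/* Buggy: jumps to 'out' on success and falls through on failure. */
	if (ASSERT_OK(err, "bpf_prog_attach"))
		goto out;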
Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-8-git-send-email-wangyufen@huawei.com --- tools/testing/selftests/bpf/prog_tests/tcp_estats.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c index 032dbfb26256..e070bca2b764 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c @@ -6,11 +6,9 @@ void test_tcp_estats(void) const char *file = "./test_tcp_estats.bpf.o"; int err, prog_fd; struct bpf_object *obj; - __u32 duration = 0; err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); - CHECK(err, "", "err %d errno %d\n", err, errno); - if (err) + if (!ASSERT_OK(err, "")) return; bpf_object__close(obj); From 3082f8cd4ba32091be82c19c357ddfd300c5a433 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 26 Sep 2022 13:12:08 +0800 Subject: [PATCH 120/143] selftests/bpf: Convert tcp_hdr_options test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-9-git-send-email-wangyufen@huawei.com --- .../bpf/prog_tests/tcp_hdr_options.c | 80 +++++++------------ 1 file changed, 28 insertions(+), 52 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c index f24436d33cd6..617bbce6ef8f 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c @@ -42,13 +42,10 @@ struct sk_fds { static int create_netns(void) { - if (CHECK(unshare(CLONE_NEWNET), "create netns", - "unshare(CLONE_NEWNET): %s (%d)", - strerror(errno), errno)) + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns")) return -1; - if (CHECK(system("ip link set dev lo up"), "run ip cmd", - "failed to bring lo link up\n")) + if (!ASSERT_OK(system("ip link set dev lo up"), "run ip cmd")) return -1; return 0; @@ -80,16 +77,12 @@ static int sk_fds_shutdown(struct sk_fds *sk_fds) shutdown(sk_fds->active_fd, SHUT_WR); ret = read(sk_fds->passive_fd, &abyte, sizeof(abyte)); - if (CHECK(ret != 0, "read-after-shutdown(passive_fd):", - "ret:%d %s (%d)\n", - ret, strerror(errno), errno)) + if (!ASSERT_EQ(ret, 0, "read-after-shutdown(passive_fd):")) return -1; shutdown(sk_fds->passive_fd, SHUT_WR); ret = read(sk_fds->active_fd, &abyte, sizeof(abyte)); - if (CHECK(ret != 0, "read-after-shutdown(active_fd):", - "ret:%d %s (%d)\n", - ret, strerror(errno), errno)) + if (!ASSERT_EQ(ret, 0, "read-after-shutdown(active_fd):")) return -1; return 0; @@ -102,8 +95,7 @@ static int sk_fds_connect(struct sk_fds *sk_fds, bool fast_open) socklen_t len; sk_fds->srv_fd = start_server(AF_INET6, SOCK_STREAM, LO_ADDR6, 0, 0); - if (CHECK(sk_fds->srv_fd == -1, "start_server", "%s (%d)\n", - strerror(errno), errno)) + if (!ASSERT_NEQ(sk_fds->srv_fd, -1, "start_server")) goto error; if (fast_open) @@ -112,28 +104,25 @@ static int sk_fds_connect(struct sk_fds *sk_fds, bool fast_open) else sk_fds->active_fd = connect_to_fd(sk_fds->srv_fd, 0); - if (CHECK_FAIL(sk_fds->active_fd == -1)) { + if (!ASSERT_NEQ(sk_fds->active_fd, -1, "")) { close(sk_fds->srv_fd); goto error; } len = sizeof(addr6); - if (CHECK(getsockname(sk_fds->srv_fd, (struct sockaddr *)&addr6, - &len), "getsockname(srv_fd)", "%s 
(%d)\n", - strerror(errno), errno)) + if (!ASSERT_OK(getsockname(sk_fds->srv_fd, (struct sockaddr *)&addr6, + &len), "getsockname(srv_fd)")) goto error_close; sk_fds->passive_lport = ntohs(addr6.sin6_port); len = sizeof(addr6); - if (CHECK(getsockname(sk_fds->active_fd, (struct sockaddr *)&addr6, - &len), "getsockname(active_fd)", "%s (%d)\n", - strerror(errno), errno)) + if (!ASSERT_OK(getsockname(sk_fds->active_fd, (struct sockaddr *)&addr6, + &len), "getsockname(active_fd)")) goto error_close; sk_fds->active_lport = ntohs(addr6.sin6_port); sk_fds->passive_fd = accept(sk_fds->srv_fd, NULL, 0); - if (CHECK(sk_fds->passive_fd == -1, "accept(srv_fd)", "%s (%d)\n", - strerror(errno), errno)) + if (!ASSERT_NEQ(sk_fds->passive_fd, -1, "accept(srv_fd)")) goto error_close; if (fast_open) { @@ -141,8 +130,7 @@ static int sk_fds_connect(struct sk_fds *sk_fds, bool fast_open) int ret; ret = read(sk_fds->passive_fd, bytes_in, sizeof(bytes_in)); - if (CHECK(ret != sizeof(fast), "read fastopen syn data", - "expected=%lu actual=%d\n", sizeof(fast), ret)) { + if (!ASSERT_EQ(ret, sizeof(fast), "read fastopen syn data")) { close(sk_fds->passive_fd); goto error_close; } @@ -163,8 +151,7 @@ static int check_hdr_opt(const struct bpf_test_option *exp, const struct bpf_test_option *act, const char *hdr_desc) { - if (CHECK(memcmp(exp, act, sizeof(*exp)), - "expected-vs-actual", "unexpected %s\n", hdr_desc)) { + if (!ASSERT_OK(memcmp(exp, act, sizeof(*exp)), hdr_desc)) { print_option(exp, "expected: "); print_option(act, " actual: "); return -1; @@ -178,13 +165,11 @@ static int check_hdr_stg(const struct hdr_stg *exp, int fd, { struct hdr_stg act; - if (CHECK(bpf_map_lookup_elem(hdr_stg_map_fd, &fd, &act), - "map_lookup(hdr_stg_map_fd)", "%s %s (%d)\n", - stg_desc, strerror(errno), errno)) + if (!ASSERT_OK(bpf_map_lookup_elem(hdr_stg_map_fd, &fd, &act), + "map_lookup(hdr_stg_map_fd)")) return -1; - if (CHECK(memcmp(exp, &act, sizeof(*exp)), - "expected-vs-actual", "unexpected %s\n", stg_desc)) { + if (!ASSERT_OK(memcmp(exp, &act, sizeof(*exp)), stg_desc)) { print_hdr_stg(exp, "expected: "); print_hdr_stg(&act, " actual: "); return -1; @@ -228,9 +213,8 @@ static void check_hdr_and_close_fds(struct sk_fds *sk_fds) if (sk_fds_shutdown(sk_fds)) goto check_linum; - if (CHECK(expected_inherit_cb_flags != skel->bss->inherit_cb_flags, - "Unexpected inherit_cb_flags", "0x%x != 0x%x\n", - skel->bss->inherit_cb_flags, expected_inherit_cb_flags)) + if (!ASSERT_EQ(expected_inherit_cb_flags, skel->bss->inherit_cb_flags, + "inherit_cb_flags")) goto check_linum; if (check_hdr_stg(&exp_passive_hdr_stg, sk_fds->passive_fd, @@ -257,7 +241,7 @@ static void check_hdr_and_close_fds(struct sk_fds *sk_fds) "active_fin_in"); check_linum: - CHECK_FAIL(check_error_linum(sk_fds)); + ASSERT_FALSE(check_error_linum(sk_fds), "check_error_linum"); sk_fds_close(sk_fds); } @@ -497,26 +481,20 @@ static void misc(void) /* MSG_EOR to ensure skb will not be combined */ ret = send(sk_fds.active_fd, send_msg, sizeof(send_msg), MSG_EOR); - if (CHECK(ret != sizeof(send_msg), "send(msg)", "ret:%d\n", - ret)) + if (!ASSERT_EQ(ret, sizeof(send_msg), "send(msg)")) goto check_linum; ret = read(sk_fds.passive_fd, recv_msg, sizeof(recv_msg)); - if (CHECK(ret != sizeof(send_msg), "read(msg)", "ret:%d\n", - ret)) + if (!ASSERT_EQ(ret, sizeof(send_msg), "read(msg)")) goto check_linum; } if (sk_fds_shutdown(&sk_fds)) goto check_linum; - CHECK(misc_skel->bss->nr_syn != 1, "unexpected nr_syn", - "expected (1) != actual (%u)\n", - misc_skel->bss->nr_syn); +
ASSERT_EQ(misc_skel->bss->nr_syn, 1, "unexpected nr_syn"); - CHECK(misc_skel->bss->nr_data != nr_data, "unexpected nr_data", - "expected (%u) != actual (%u)\n", - nr_data, misc_skel->bss->nr_data); + ASSERT_EQ(misc_skel->bss->nr_data, nr_data, "unexpected nr_data"); /* The last ACK may have been delayed, so it is either 1 or 2. */ CHECK(misc_skel->bss->nr_pure_ack != 1 && @@ -525,12 +503,10 @@ static void misc(void) "expected (1 or 2) != actual (%u)\n", misc_skel->bss->nr_pure_ack); - CHECK(misc_skel->bss->nr_fin != 1, "unexpected nr_fin", - "expected (1) != actual (%u)\n", - misc_skel->bss->nr_fin); + ASSERT_EQ(misc_skel->bss->nr_fin, 1, "unexpected nr_fin"); check_linum: - CHECK_FAIL(check_error_linum(&sk_fds)); + ASSERT_FALSE(check_error_linum(&sk_fds), "check_error_linum"); sk_fds_close(&sk_fds); bpf_link__destroy(link); } @@ -555,15 +531,15 @@ void test_tcp_hdr_options(void) int i; skel = test_tcp_hdr_options__open_and_load(); - if (CHECK(!skel, "open and load skel", "failed")) + if (!ASSERT_OK_PTR(skel, "open and load skel")) return; misc_skel = test_misc_tcp_hdr_options__open_and_load(); - if (CHECK(!misc_skel, "open and load misc test skel", "failed")) + if (!ASSERT_OK_PTR(misc_skel, "open and load misc test skel")) goto skel_destroy; cg_fd = test__join_cgroup(CG_NAME); - if (CHECK_FAIL(cg_fd < 0)) + if (!ASSERT_GE(cg_fd, 0, "join_cgroup")) goto skel_destroy; for (i = 0; i < ARRAY_SIZE(tests); i++) { From 8dda32ac58b622f4c1ec0edd4f6b12f84170fe01 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 26 Sep 2022 13:12:09 +0800 Subject: [PATCH 121/143] selftests/bpf: Convert tcp_rtt test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-10-git-send-email-wangyufen@huawei.com --- tools/testing/selftests/bpf/prog_tests/tcp_rtt.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index 96ff2c20af81..8fe84da1b9b4 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -16,8 +16,7 @@ static void send_byte(int fd) { char b = 0x55; - if (CHECK_FAIL(write(fd, &b, sizeof(b)) != 1)) - perror("Failed to send single byte"); + ASSERT_EQ(write(fd, &b, sizeof(b)), 1, "send single byte"); } static int wait_for_ack(int fd, int retries) @@ -51,10 +50,8 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked, int err = 0; struct tcp_rtt_storage val; - if (CHECK_FAIL(bpf_map_lookup_elem(map_fd, &client_fd, &val) < 0)) { - perror("Failed to read socket storage"); + if (!ASSERT_GE(bpf_map_lookup_elem(map_fd, &client_fd, &val), 0, "read socket storage")) return -1; - } if (val.invoked != invoked) { log_err("%s: unexpected bpf_tcp_sock.invoked %d != %d", @@ -151,14 +148,14 @@ void test_tcp_rtt(void) int server_fd, cgroup_fd; cgroup_fd = test__join_cgroup("/tcp_rtt"); - if (CHECK_FAIL(cgroup_fd < 0)) + if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /tcp_rtt")) return; server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0); - if (CHECK_FAIL(server_fd < 0)) + if (!ASSERT_GE(server_fd, 0, "start_server")) goto close_cgroup_fd; - CHECK_FAIL(run_test(cgroup_fd, server_fd)); + ASSERT_OK(run_test(cgroup_fd, server_fd), "run_test"); close(server_fd); From 9d0b05bdfbea25693cdd63c29aa12b982307d81e Mon Sep 17 00:00:00 2001 From: Wang Yufen
Date: Mon, 26 Sep 2022 13:12:10 +0800 Subject: [PATCH 122/143] selftests/bpf: Convert tcpbpf_user test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-11-git-send-email-wangyufen@huawei.com --- .../selftests/bpf/prog_tests/tcpbpf_user.c | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c index 87923d2865b7..7e8fe1bad03f 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c +++ b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c @@ -8,8 +8,6 @@ #define LO_ADDR6 "::1" #define CG_NAME "/tcpbpf-user-test" -static __u32 duration; - static void verify_result(struct tcpbpf_globals *result) { __u32 expected_events = ((1 << BPF_SOCK_OPS_TIMEOUT_INIT) | @@ -22,9 +20,7 @@ static void verify_result(struct tcpbpf_globals *result) (1 << BPF_SOCK_OPS_TCP_LISTEN_CB)); /* check global map */ - CHECK(expected_events != result->event_map, "event_map", - "unexpected event_map: actual 0x%08x != expected 0x%08x\n", - result->event_map, expected_events); + ASSERT_EQ(expected_events, result->event_map, "event_map"); ASSERT_EQ(result->bytes_received, 501, "bytes_received"); ASSERT_EQ(result->bytes_acked, 1002, "bytes_acked"); @@ -56,18 +52,15 @@ static void run_test(struct tcpbpf_globals *result) int i, rv; listen_fd = start_server(AF_INET6, SOCK_STREAM, LO_ADDR6, 0, 0); - if (CHECK(listen_fd == -1, "start_server", "listen_fd:%d errno:%d\n", - listen_fd, errno)) + if (!ASSERT_NEQ(listen_fd, -1, "start_server")) goto done; cli_fd = connect_to_fd(listen_fd, 0); - if (CHECK(cli_fd == -1, "connect_to_fd(listen_fd)", - "cli_fd:%d errno:%d\n", cli_fd, errno)) + if (!ASSERT_NEQ(cli_fd, -1, "connect_to_fd(listen_fd)")) goto done; accept_fd = accept(listen_fd, NULL, NULL); - if (CHECK(accept_fd == -1, "accept(listen_fd)", - "accept_fd:%d errno:%d\n", accept_fd, errno)) + if (!ASSERT_NEQ(accept_fd, -1, "accept(listen_fd)")) goto done; /* Send 1000B of '+'s from cli_fd -> accept_fd */ @@ -75,11 +68,11 @@ static void run_test(struct tcpbpf_globals *result) buf[i] = '+'; rv = send(cli_fd, buf, 1000, 0); - if (CHECK(rv != 1000, "send(cli_fd)", "rv:%d errno:%d\n", rv, errno)) + if (!ASSERT_EQ(rv, 1000, "send(cli_fd)")) goto done; rv = recv(accept_fd, buf, 1000, 0); - if (CHECK(rv != 1000, "recv(accept_fd)", "rv:%d errno:%d\n", rv, errno)) + if (!ASSERT_EQ(rv, 1000, "recv(accept_fd)")) goto done; /* Send 500B of '.'s from accept_fd ->cli_fd */ @@ -87,11 +80,11 @@ static void run_test(struct tcpbpf_globals *result) buf[i] = '.'; rv = send(accept_fd, buf, 500, 0); - if (CHECK(rv != 500, "send(accept_fd)", "rv:%d errno:%d\n", rv, errno)) + if (!ASSERT_EQ(rv, 500, "send(accept_fd)")) goto done; rv = recv(cli_fd, buf, 500, 0); - if (CHECK(rv != 500, "recv(cli_fd)", "rv:%d errno:%d\n", rv, errno)) + if (!ASSERT_EQ(rv, 500, "recv(cli_fd)")) goto done; /* @@ -100,12 +93,12 @@ static void run_test(struct tcpbpf_globals *result) */ shutdown(accept_fd, SHUT_WR); err = recv(cli_fd, buf, 1, 0); - if (CHECK(err, "recv(cli_fd) for fin", "err:%d errno:%d\n", err, errno)) + if (!ASSERT_OK(err, "recv(cli_fd) for fin")) goto done; shutdown(cli_fd, SHUT_WR); err = recv(accept_fd, buf, 1, 0); - CHECK(err, "recv(accept_fd) for fin", "err:%d errno:%d\n", err, errno); + ASSERT_OK(err, "recv(accept_fd) for fin"); done: if 
(accept_fd != -1) close(accept_fd); @@ -124,12 +117,11 @@ void test_tcpbpf_user(void) int cg_fd = -1; skel = test_tcpbpf_kern__open_and_load(); - if (CHECK(!skel, "open and load skel", "failed")) + if (!ASSERT_OK_PTR(skel, "open and load skel")) return; cg_fd = test__join_cgroup(CG_NAME); - if (CHECK(cg_fd < 0, "test__join_cgroup(" CG_NAME ")", - "cg_fd:%d errno:%d", cg_fd, errno)) + if (!ASSERT_GE(cg_fd, 0, "test__join_cgroup(" CG_NAME ")")) goto err; skel->links.bpf_testcb = bpf_program__attach_cgroup(skel->progs.bpf_testcb, cg_fd); From 1fddca3d36d1dc4a19a8060d20de1b77edfe63e0 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 26 Sep 2022 13:12:11 +0800 Subject: [PATCH 123/143] selftests/bpf: Convert udp_limit test to ASSERT_* macros Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). Signed-off-by: Wang Yufen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1664169131-32405-12-git-send-email-wangyufen@huawei.com --- .../selftests/bpf/prog_tests/udp_limit.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/udp_limit.c b/tools/testing/selftests/bpf/prog_tests/udp_limit.c index 56c9d6bd38a3..2643d896ddae 100644 --- a/tools/testing/selftests/bpf/prog_tests/udp_limit.c +++ b/tools/testing/selftests/bpf/prog_tests/udp_limit.c @@ -5,8 +5,6 @@ #include <sys/types.h> #include <sys/socket.h> -static int duration; - void test_udp_limit(void) { struct udp_limit *skel; @@ -14,11 +12,11 @@ void test_udp_limit(void) int cgroup_fd; cgroup_fd = test__join_cgroup("/udp_limit"); - if (CHECK(cgroup_fd < 0, "cg-join", "errno %d", errno)) + if (!ASSERT_GE(cgroup_fd, 0, "cg-join")) return; skel = udp_limit__open_and_load(); - if (CHECK(!skel, "skel-load", "errno %d", errno)) + if (!ASSERT_OK_PTR(skel, "skel-load")) goto close_cgroup_fd; skel->links.sock = bpf_program__attach_cgroup(skel->progs.sock, cgroup_fd); @@ -32,11 +30,11 @@ void test_udp_limit(void) * verify that. */ fd1 = socket(AF_INET, SOCK_DGRAM, 0); - if (CHECK(fd1 < 0, "fd1", "errno %d", errno)) + if (!ASSERT_GE(fd1, 0, "socket(fd1)")) goto close_skeleton; fd2 = socket(AF_INET, SOCK_DGRAM, 0); - if (CHECK(fd2 >= 0, "fd2", "errno %d", errno)) + if (!ASSERT_LT(fd2, 0, "socket(fd2)")) goto close_skeleton; /* We can reopen again after close. */ @@ -44,7 +42,7 @@ void test_udp_limit(void) fd1 = -1; fd1 = socket(AF_INET, SOCK_DGRAM, 0); - if (CHECK(fd1 < 0, "fd1-again", "errno %d", errno)) + if (!ASSERT_GE(fd1, 0, "socket(fd1-again)")) goto close_skeleton; /* Make sure the program was invoked the expected @@ -54,13 +52,11 @@ * - close fd1 - BPF_CGROUP_INET_SOCK_RELEASE * - open fd1 again - BPF_CGROUP_INET_SOCK_CREATE */ - if (CHECK(skel->bss->invocations != 4, "bss-invocations", - "invocations=%d", skel->bss->invocations)) + if (!ASSERT_EQ(skel->bss->invocations, 4, "bss-invocations")) goto close_skeleton; /* We should still have a single socket in use */ - if (CHECK(skel->bss->in_use != 1, "bss-in_use", - "in_use=%d", skel->bss->in_use)) + if (!ASSERT_EQ(skel->bss->in_use, 1, "bss-in_use")) goto close_skeleton; close_skeleton: From 64696c40d03c01e0ea2e3e9aa1c490a7b6a1b6be Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:03 -0700 Subject: [PATCH 124/143] bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline The struct_ops prog is to allow using bpf to implement the functions in a struct (eg. kernel module). The current usage is to implement the tcp_congestion.
The kernel does not call the tcp-cc's ops (ie. the bpf prog) in a recursive way. The struct_ops shares the tracing trampoline's enter/exit functions, which track prog->active to avoid recursion; that tracking is needed for tracing progs. However, it turns out the struct_ops bpf prog can hit this prog->active check and unnecessarily skip running the struct_ops prog. eg. the '.ssthresh' may run in_task() and then be interrupted by a softirq that runs the same '.ssthresh'. Skipping the '.ssthresh' will end up returning a random value to the caller. The patch adds __bpf_prog_{enter,exit}_struct_ops for the struct_ops trampoline. They do not track the prog->active to detect recursion. One exception is when the tcp_congestion's '.init' ops is doing bpf_setsockopt(TCP_CONGESTION) and then recurs to the same '.init' ops. This will be addressed in the following patches. Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism") Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220929070407.965581-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 3 +++ include/linux/bpf.h | 4 ++++ kernel/bpf/trampoline.c | 23 +++++++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 35796db58116..5b6230779cf3 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1836,6 +1836,9 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (p->aux->sleepable) { enter = __bpf_prog_enter_sleepable; exit = __bpf_prog_exit_sleepable; + } else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) { + enter = __bpf_prog_enter_struct_ops; + exit = __bpf_prog_exit_struct_ops; } else if (p->expected_attach_type == BPF_LSM_CGROUP) { enter = __bpf_prog_enter_lsm_cgroup; exit = __bpf_prog_exit_lsm_cgroup; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0f3eaf3ed98c..9e7d46d16032 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -864,6 +864,10 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6f7b939321d6..bf0906e1e2b9 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -964,6 +964,29 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, rcu_read_unlock_trace(); } +u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return bpf_prog_start_time(); +} + +void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + update_prog_stats(prog, start); + migrate_enable(); + rcu_read_unlock(); +} + void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) { percpu_ref_get(&tr->pcref); From 37cfbe0bf2e85287350a6b0ca9521f5a4c7389ce Mon Sep 17 00:00:00 2001 From: Martin
KaFai Lau Date: Thu, 29 Sep 2022 00:04:04 -0700 Subject: [PATCH 125/143] bpf: Move the "cdg" tcp-cc check to the common sol_tcp_sockopt() The check on the tcp-cc, "cdg", is done in the bpf_sk_setsockopt which is used by the bpf_tcp_ca, bpf_lsm, cg_sockopt, and tcp_iter hooks. However, it is not done for cg sock_addr, cg sockops, and some of the bpf_lsm_cgroup hooks. The tcp-cc "cdg" should have very limited usage. This patch is to move the "cdg" check to the common sol_tcp_sockopt() so that all hooks have a consistent behavior. The motivation to make this check consistent now is that a later patch will refactor the bpf_setsockopt(TCP_CONGESTION) into another function, so it is better to take this chance to refactor this piece also. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220929070407.965581-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 2fd9449026aa..f4cea3ff994a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5127,6 +5127,13 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, case TCP_CONGESTION: if (*optlen < 2) return -EINVAL; + /* "cdg" is the only cc that alloc a ptr + * in inet_csk_ca area. The bpf-tcp-cc may + * overwrite this ptr after switching to cdg. + */ + if (!getopt && *optlen >= sizeof("cdg") - 1 && + !strncmp("cdg", optval, *optlen)) + return -ENOTSUPP; break; case TCP_SAVED_SYN: if (*optlen < 1) @@ -5285,12 +5292,6 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { - if (level == SOL_TCP && optname == TCP_CONGESTION) { - if (optlen >= sizeof("cdg") - 1 && - !strncmp("cdg", optval, optlen)) - return -ENOTSUPP; - } - return _bpf_setsockopt(sk, level, optname, optval, optlen); } From 1e7d217faa11ac027f622124a3842aafbd0c4a42 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:05 -0700 Subject: [PATCH 126/143] bpf: Refactor bpf_setsockopt(TCP_CONGESTION) handling into another function This patch moves the bpf_setsockopt(TCP_CONGESTION) logic into another function. The next patch will add extra logic to avoid recursion, and this will make that patch easier to follow. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220929070407.965581-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index f4cea3ff994a..96f2f7a65e65 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5102,6 +5102,33 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, return 0; } +static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, + int *optlen, bool getopt) +{ + if (*optlen < 2) + return -EINVAL; + + if (getopt) { + if (!inet_csk(sk)->icsk_ca_ops) + return -EINVAL; + /* BPF expects NULL-terminated tcp-cc string */ + optval[--(*optlen)] = '\0'; + return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + /* "cdg" is the only cc that alloc a ptr + * in inet_csk_ca area. The bpf-tcp-cc may + * overwrite this ptr after switching to cdg.
+ */ + if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) + return -ENOTSUPP; + + return do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), *optlen); +} + static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) @@ -5125,16 +5152,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, return -EINVAL; break; case TCP_CONGESTION: - if (*optlen < 2) - return -EINVAL; - /* "cdg" is the only cc that alloc a ptr - * in inet_csk_ca area. The bpf-tcp-cc may - * overwrite this ptr after switching to cdg. - */ - if (!getopt && *optlen >= sizeof("cdg") - 1 && - !strncmp("cdg", optval, *optlen)) - return -ENOTSUPP; - break; + return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); case TCP_SAVED_SYN: if (*optlen < 1) return -EINVAL; @@ -5159,13 +5177,6 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, return 0; } - if (optname == TCP_CONGESTION) { - if (!inet_csk(sk)->icsk_ca_ops) - return -EINVAL; - /* BPF expects NULL-terminated tcp-cc string */ - optval[--(*optlen)] = '\0'; - } - return do_tcp_getsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); From 061ff040710e9f6f043d1fa80b1b362d2845b17a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:06 -0700 Subject: [PATCH 127/143] bpf: tcp: Stop bpf_setsockopt(TCP_CONGESTION) in init ops to recur itself When a bad bpf prog '.init' calls bpf_setsockopt(TCP_CONGESTION, "itself"), it will trigger this loop: .init => bpf_setsockopt(tcp_cc) => .init => bpf_setsockopt(tcp_cc) ... ... => .init => bpf_setsockopt(tcp_cc). It was prevented by the prog->active counter before, but the prog->active detection cannot be used in struct_ops, as explained in an earlier patch of the set. In this patch, the second bpf_setsockopt(tcp_cc) is not allowed, in order to break the loop. This is done by using a bit of an existing 1 byte hole in tcp_sock to check if there is an on-going bpf_setsockopt(TCP_CONGESTION) in this tcp_sock. Note that this essentially means only the first '.init' can call bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (eg. the peer does not support ECN); the second '.init' cannot fall back to another cc. This applies even when the second bpf_setsockopt(TCP_CONGESTION) would not cause a loop. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220929070407.965581-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/tcp.h | 6 ++++++ net/core/filter.c | 28 +++++++++++++++++++++++++++- net/ipv4/tcp_minisocks.c | 1 + 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9fbe22732c3..3bdf687e2fb3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -388,6 +388,12 @@ struct tcp_sock { u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ + u8 bpf_chg_cc_inprogress:1; /* In the middle of + * bpf_setsockopt(TCP_CONGESTION), + * it is to avoid the bpf_tcp_cc->init() + * to recur itself by calling + * bpf_setsockopt(TCP_CONGESTION, "itself").
+ */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 diff --git a/net/core/filter.c b/net/core/filter.c index 96f2f7a65e65..ac4c45c02da5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5105,6 +5105,9 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, int *optlen, bool getopt) { + struct tcp_sock *tp; + int ret; + if (*optlen < 2) return -EINVAL; @@ -5125,8 +5128,31 @@ static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) return -ENOTSUPP; - return do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + /* It stops this looping + * + * .init => bpf_setsockopt(tcp_cc) => .init => + * bpf_setsockopt(tcp_cc)" => .init => .... + * + * The second bpf_setsockopt(tcp_cc) is not allowed + * in order to break the loop when both .init + * are the same bpf prog. + * + * This applies even the second bpf_setsockopt(tcp_cc) + * does not cause a loop. This limits only the first + * '.init' can call bpf_setsockopt(TCP_CONGESTION) to + * pick a fallback cc (eg. peer does not support ECN) + * and the second '.init' cannot fallback to + * another. + */ + tp = tcp_sk(sk); + if (tp->bpf_chg_cc_inprogress) + return -EBUSY; + + tp->bpf_chg_cc_inprogress = 1; + ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), *optlen); + tp->bpf_chg_cc_inprogress = 0; + return ret; } static int sol_tcp_sockopt(struct sock *sk, int optname, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index cb95d88497ae..ddcdc2bc4c04 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -541,6 +541,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); + newtp->bpf_chg_cc_inprogress = 0; tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); From 3411c5b6f8d6e08d98e606dcf74fc42e2f9d731f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:07 -0700 Subject: [PATCH 128/143] selftests/bpf: Check -EBUSY for the recurred bpf_setsockopt(TCP_CONGESTION) This patch changes the bpf_dctcp test to ensure the recurred bpf_setsockopt(TCP_CONGESTION) returns -EBUSY. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220929070407.965581-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 4 +++ tools/testing/selftests/bpf/progs/bpf_dctcp.c | 25 +++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 2959a52ced06..e980188d4124 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -290,6 +290,10 @@ static void test_dctcp_fallback(void) goto done; ASSERT_STREQ(dctcp_skel->bss->cc_res, "cubic", "cc_res"); ASSERT_EQ(dctcp_skel->bss->tcp_cdg_res, -ENOTSUPP, "tcp_cdg_res"); + /* All setsockopt(TCP_CONGESTION) in the recurred + * bpf_dctcp->init() should fail with -EBUSY. 
+ */ + ASSERT_EQ(dctcp_skel->bss->ebusy_cnt, 3, "ebusy_cnt"); err = getsockopt(srv_fd, SOL_TCP, TCP_CONGESTION, srv_cc, &cc_len); if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 9573be6122be..460682759aed 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -11,6 +11,7 @@ #include <linux/types.h> #include <linux/stddef.h> #include <linux/tcp.h> +#include <errno.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include "bpf_tcp_helpers.h" @@ -23,6 +24,7 @@ const char tcp_cdg[] = "cdg"; char cc_res[TCP_CA_NAME_MAX]; int tcp_cdg_res = 0; int stg_result = 0; +int ebusy_cnt = 0; struct { __uint(type, BPF_MAP_TYPE_SK_STORAGE); @@ -64,16 +66,23 @@ void BPF_PROG(dctcp_init, struct sock *sk) if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { /* Switch to fallback */ - bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, - (void *)fallback, sizeof(fallback)); - /* Switch back to myself which the bpf trampoline - * stopped calling dctcp_init recursively. + if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)fallback, sizeof(fallback)) == -EBUSY) + ebusy_cnt++; + + /* Switch back to myself and the recurred dctcp_init() + * will get -EBUSY for all bpf_setsockopt(TCP_CONGESTION), + * except the last "cdg" one. */ - bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, - (void *)bpf_dctcp, sizeof(bpf_dctcp)); + if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)bpf_dctcp, sizeof(bpf_dctcp)) == -EBUSY) + ebusy_cnt++; + /* Switch back to fallback */ - bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, - (void *)fallback, sizeof(fallback)); + if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)fallback, sizeof(fallback)) == -EBUSY) + ebusy_cnt++; + /* Expecting -ENOTSUPP for tcp_cdg_res */ tcp_cdg_res = bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, (void *)tcp_cdg, sizeof(tcp_cdg)); From 6166da0a02cde26c065692d0c05eb685178fee75 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Tue, 27 Sep 2022 18:59:44 +0000 Subject: [PATCH 129/143] bpf, docs: Move legacy packet instructions to a separate file Move legacy packet instructions to a separate file. Signed-off-by: Dave Thaler Link: https://lore.kernel.org/r/20220927185958.14995-1-dthaler1968@googlemail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/instruction-set.rst | 38 ++-------------- Documentation/bpf/linux-notes.rst | 65 +++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 35 deletions(-) create mode 100644 Documentation/bpf/linux-notes.rst diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 1b0e6711dec9..352f25a1eb17 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -282,8 +282,6 @@ arithmetic operations in the imm field to encode the atomic operation: *(u64 *)(dst_reg + off16) += src_reg -``BPF_XADD`` is a deprecated name for ``BPF_ATOMIC | BPF_ADD``. - In addition to the simple atomic operations, there also is a modifier and two complex atomic operations: @@ -331,36 +329,6 @@ There is currently only one such instruction. Legacy BPF Packet access instructions ------------------------------------- -eBPF has special instructions for access to packet data that have been -carried over from classic BPF to retain the performance of legacy socket -filters running in the eBPF interpreter. - -The instructions come in two forms: ``BPF_ABS | <size> | BPF_LD`` and -``BPF_IND | <size> | BPF_LD``.
- -These instructions are used to access packet data and can only be used when -the program context is a pointer to networking packet. ``BPF_ABS`` -accesses packet data at an absolute offset specified by the immediate data -and ``BPF_IND`` access packet data at an offset that includes the value of -a register in addition to the immediate data. - -These instructions have seven implicit operands: - - * Register R6 is an implicit input that must contain pointer to a - struct sk_buff. - * Register R0 is an implicit output which contains the data fetched from - the packet. - * Registers R1-R5 are scratch registers that are clobbered after a call to - ``BPF_ABS | BPF_LD`` or ``BPF_IND | BPF_LD`` instructions. - -These instructions have an implicit program exit condition as well. When an -eBPF program is trying to access the data beyond the packet boundary, the -program execution will be aborted. - -``BPF_ABS | BPF_W | BPF_LD`` means:: - - R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + imm32)) - -``BPF_IND | BPF_W | BPF_LD`` means:: - - R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32)) +eBPF previously introduced special instructions for access to packet data that were +carried over from classic BPF. However, these instructions are +deprecated and should no longer be used. diff --git a/Documentation/bpf/linux-notes.rst b/Documentation/bpf/linux-notes.rst new file mode 100644 index 000000000000..93c01386d92c --- /dev/null +++ b/Documentation/bpf/linux-notes.rst @@ -0,0 +1,65 @@ +.. contents:: +.. sectnum:: + +========================== +Linux implementation notes +========================== + +This document provides more details specific to the Linux kernel implementation of the eBPF instruction set. + +Legacy BPF Packet access instructions +===================================== + +As mentioned in the `ISA standard documentation <instruction-set.rst>`_, +Linux has special eBPF instructions for access to packet data that have been +carried over from classic BPF to retain the performance of legacy socket +filters running in the eBPF interpreter. + +The instructions come in two forms: ``BPF_ABS | <size> | BPF_LD`` and +``BPF_IND | <size> | BPF_LD``. + +These instructions are used to access packet data and can only be used when +the program context is a pointer to a networking packet. ``BPF_ABS`` +accesses packet data at an absolute offset specified by the immediate data +and ``BPF_IND`` accesses packet data at an offset that includes the value of +a register in addition to the immediate data. + +These instructions have seven implicit operands: + +* Register R6 is an implicit input that must contain a pointer to a + struct sk_buff. +* Register R0 is an implicit output which contains the data fetched from + the packet. +* Registers R1-R5 are scratch registers that are clobbered by the + instruction. + +These instructions have an implicit program exit condition as well. If an +eBPF program attempts to access data beyond the packet boundary, the +program execution will be aborted. + +``BPF_ABS | BPF_W | BPF_LD`` (0x20) means:: + + R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + imm)) + +where ``ntohl()`` converts a 32-bit value from network byte order to host byte order. + +``BPF_IND | BPF_W | BPF_LD`` (0x40) means:: + + R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + src + imm)) + +Appendix +======== + +For reference, the following table lists legacy Linux-specific opcodes in order by value.
+ +====== ==== =================================================== ============= +opcode imm description reference +====== ==== =================================================== ============= +0x20 any dst = ntohl(\*(uint32_t \*)(R6->data + imm)) `Legacy BPF Packet access instructions`_ +0x28 any dst = ntohs(\*(uint16_t \*)(R6->data + imm)) `Legacy BPF Packet access instructions`_ +0x30 any dst = (\*(uint8_t \*)(R6->data + imm)) `Legacy BPF Packet access instructions`_ +0x38 any dst = ntohll(\*(uint64_t \*)(R6->data + imm)) `Legacy BPF Packet access instructions`_ +0x40 any dst = ntohl(\*(uint32_t \*)(R6->data + src + imm)) `Legacy BPF Packet access instructions`_ +0x48 any dst = ntohs(\*(uint16_t \*)(R6->data + src + imm)) `Legacy BPF Packet access instructions`_ +0x50 any dst = (\*(uint8_t \*)(R6->data + src + imm)) `Legacy BPF Packet access instructions`_ +0x58 any dst = ntohll(\*(uint64_t \*)(R6->data + src + imm)) `Legacy BPF Packet access instructions`_ From 9a0bf21337c667375d918adc41239ce54304a12c Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Tue, 27 Sep 2022 18:59:45 +0000 Subject: [PATCH 130/143] bpf, docs: Linux byteswap note Add Linux byteswap note. Signed-off-by: Dave Thaler Link: https://lore.kernel.org/r/20220927185958.14995-2-dthaler1968@googlemail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/instruction-set.rst | 4 ---- Documentation/bpf/linux-notes.rst | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 352f25a1eb17..1735b91ec4c7 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -156,10 +156,6 @@ Examples: dst_reg = htobe64(dst_reg) -``BPF_FROM_LE`` and ``BPF_FROM_BE`` exist as aliases for ``BPF_TO_LE`` and -``BPF_TO_BE`` respectively. - - Jump instructions ----------------- diff --git a/Documentation/bpf/linux-notes.rst b/Documentation/bpf/linux-notes.rst index 93c01386d92c..1c31379b469f 100644 --- a/Documentation/bpf/linux-notes.rst +++ b/Documentation/bpf/linux-notes.rst @@ -7,6 +7,11 @@ Linux implementation notes This document provides more details specific to the Linux kernel implementation of the eBPF instruction set. +Byte swap instructions +====================== + +``BPF_FROM_LE`` and ``BPF_FROM_BE`` exist as aliases for ``BPF_TO_LE`` and ``BPF_TO_BE`` respectively. + Legacy BPF Packet access instructions ===================================== From 6c7aaffb24efbd5d1ae067b2b629b3ffcc37e18e Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Tue, 27 Sep 2022 18:59:46 +0000 Subject: [PATCH 131/143] bpf, docs: Move Clang notes to a separate file Move Clang notes to a separate file. Signed-off-by: Dave Thaler Link: https://lore.kernel.org/r/20220927185958.14995-3-dthaler1968@googlemail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/clang-notes.rst | 24 ++++++++++++++++++++++++ Documentation/bpf/instruction-set.rst | 6 ------ 2 files changed, 24 insertions(+), 6 deletions(-) create mode 100644 Documentation/bpf/clang-notes.rst diff --git a/Documentation/bpf/clang-notes.rst b/Documentation/bpf/clang-notes.rst new file mode 100644 index 000000000000..b15179cb5117 --- /dev/null +++ b/Documentation/bpf/clang-notes.rst @@ -0,0 +1,24 @@ +.. contents:: +.. sectnum:: + +========================== +Clang implementation notes +========================== + +This document provides more details specific to the Clang/LLVM implementation of the eBPF instruction set.
+ +Versions +======== + +Clang defined "CPU" versions, where a CPU version of 3 corresponds to the current eBPF ISA. + +Clang can select the eBPF ISA version using ``-mcpu=v3`` for example to select version 3. + +Atomic operations +================= + +Clang can generate atomic instructions by default when ``-mcpu=v3`` is +enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction +Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable +the atomics features, while keeping a lower ``-mcpu`` version, you can use +``-Xclang -target-feature -Xclang +alu32``. diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 1735b91ec4c7..541483118f65 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -303,12 +303,6 @@ The ``BPF_CMPXCHG`` operation atomically compares the value addressed by value that was at ``dst_reg + off`` before the operation is zero-extended and loaded back to ``R0``. -Clang can generate atomic instructions by default when ``-mcpu=v3`` is -enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction -Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable -the atomics features, while keeping a lower ``-mcpu`` version, you can use -``-Xclang -target-feature -Xclang +alu32``. - 64-bit immediate instructions ----------------------------- From ee159bdbdbce293e66d7b9249208f367faff5d81 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Tue, 27 Sep 2022 18:59:47 +0000 Subject: [PATCH 132/143] bpf, docs: Add Clang note about BPF_ALU Add Clang note about BPF_ALU. Signed-off-by: Dave Thaler Link: https://lore.kernel.org/r/20220927185958.14995-4-dthaler1968@googlemail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/clang-notes.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/bpf/clang-notes.rst b/Documentation/bpf/clang-notes.rst index b15179cb5117..528feddf2db9 100644 --- a/Documentation/bpf/clang-notes.rst +++ b/Documentation/bpf/clang-notes.rst @@ -14,6 +14,12 @@ Clang defined "CPU" versions, where a CPU version of 3 corresponds to the curren Clang can select the eBPF ISA version using ``-mcpu=v3`` for example to select version 3. +Arithmetic instructions +======================= + +For CPU versions prior to 3, Clang v7.0 and later can enable ``BPF_ALU`` support with +``-Xclang -target-feature -Xclang +alu32``. In CPU version 3, support is automatically included. + Atomic operations ================= From 5a8921ba96ceaec0c00c8855e48940d2739c5c3b Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Tue, 27 Sep 2022 18:59:48 +0000 Subject: [PATCH 133/143] bpf, docs: Add TOC and fix formatting. Add TOC and fix formatting. Signed-off-by: Dave Thaler Link: https://lore.kernel.org/r/20220927185958.14995-5-dthaler1968@googlemail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/instruction-set.rst | 262 +++++++++++++------------- 1 file changed, 133 insertions(+), 129 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 541483118f65..4997d2088fef 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -1,7 +1,12 @@ +.. contents:: +.. sectnum:: + +======================================== +eBPF Instruction Set Specification, v1.0 +======================================== + +This document specifies version 1.0 of the eBPF instruction set. 
-==================== -eBPF Instruction Set -==================== Registers and calling convention ================================ @@ -11,10 +16,10 @@ all of which are 64-bits wide. The eBPF calling convention is defined as: - * R0: return value from function calls, and exit value for eBPF programs - * R1 - R5: arguments for function calls - * R6 - R9: callee saved registers that function calls will preserve - * R10: read-only frame pointer to access stack +* R0: return value from function calls, and exit value for eBPF programs +* R1 - R5: arguments for function calls +* R6 - R9: callee saved registers that function calls will preserve +* R10: read-only frame pointer to access stack R0 - R5 are scratch registers and eBPF programs needs to spill/fill them if necessary across calls. @@ -24,17 +29,17 @@ Instruction encoding eBPF has two instruction encodings: - * the basic instruction encoding, which uses 64 bits to encode an instruction - * the wide instruction encoding, which appends a second 64-bit immediate value - (imm64) after the basic instruction for a total of 128 bits. +* the basic instruction encoding, which uses 64 bits to encode an instruction +* the wide instruction encoding, which appends a second 64-bit immediate value + (imm64) after the basic instruction for a total of 128 bits. The basic instruction encoding looks as follows: - ============= ======= =============== ==================== ============ - 32 bits (MSB) 16 bits 4 bits 4 bits 8 bits (LSB) - ============= ======= =============== ==================== ============ - immediate offset source register destination register opcode - ============= ======= =============== ==================== ============ +============= ======= =============== ==================== ============ +32 bits (MSB) 16 bits 4 bits 4 bits 8 bits (LSB) +============= ======= =============== ==================== ============ +immediate offset source register destination register opcode +============= ======= =============== ==================== ============ Note that most instructions do not use all of the fields. Unused fields shall be cleared to zero. 
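As a concrete illustration of the basic encoding described above, the Linux uapi header include/uapi/linux/bpf.h defines the instruction as the C struct sketched below; the fields run from the least significant bits (opcode) to the most significant (immediate):

	struct bpf_insn {
		__u8	code;		/* 8-bit opcode */
		__u8	dst_reg:4;	/* 4-bit destination register */
		__u8	src_reg:4;	/* 4-bit source register */
		__s16	off;		/* 16-bit signed offset */
		__s32	imm;		/* 32-bit signed immediate */
	};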
@@ -44,30 +49,30 @@ Instruction classes The three LSB bits of the 'opcode' field store the instruction class: - ========= ===== =============================== - class value description - ========= ===== =============================== - BPF_LD 0x00 non-standard load operations - BPF_LDX 0x01 load into register operations - BPF_ST 0x02 store from immediate operations - BPF_STX 0x03 store from register operations - BPF_ALU 0x04 32-bit arithmetic operations - BPF_JMP 0x05 64-bit jump operations - BPF_JMP32 0x06 32-bit jump operations - BPF_ALU64 0x07 64-bit arithmetic operations - ========= ===== =============================== +========= ===== =============================== =================================== +class value description reference +========= ===== =============================== =================================== +BPF_LD 0x00 non-standard load operations `Load and store instructions`_ +BPF_LDX 0x01 load into register operations `Load and store instructions`_ +BPF_ST 0x02 store from immediate operations `Load and store instructions`_ +BPF_STX 0x03 store from register operations `Load and store instructions`_ +BPF_ALU 0x04 32-bit arithmetic operations `Arithmetic and jump instructions`_ +BPF_JMP 0x05 64-bit jump operations `Arithmetic and jump instructions`_ +BPF_JMP32 0x06 32-bit jump operations `Arithmetic and jump instructions`_ +BPF_ALU64 0x07 64-bit arithmetic operations `Arithmetic and jump instructions`_ +========= ===== =============================== =================================== Arithmetic and jump instructions ================================ -For arithmetic and jump instructions (BPF_ALU, BPF_ALU64, BPF_JMP and -BPF_JMP32), the 8-bit 'opcode' field is divided into three parts: +For arithmetic and jump instructions (``BPF_ALU``, ``BPF_ALU64``, ``BPF_JMP`` and +``BPF_JMP32``), the 8-bit 'opcode' field is divided into three parts: - ============== ====== ================= - 4 bits (MSB) 1 bit 3 bits (LSB) - ============== ====== ================= - operation code source instruction class - ============== ====== ================= +============== ====== ================= +4 bits (MSB) 1 bit 3 bits (LSB) +============== ====== ================= +operation code source instruction class +============== ====== ================= The 4th bit encodes the source operand: @@ -84,51 +89,51 @@ The four MSB bits store the operation code. Arithmetic instructions ----------------------- -BPF_ALU uses 32-bit wide operands while BPF_ALU64 uses 64-bit wide operands for +``BPF_ALU`` uses 32-bit wide operands while ``BPF_ALU64`` uses 64-bit wide operands for otherwise identical operations. 
-The code field encodes the operation as below: - ======== ===== ================================================= - code value description - ======== ===== ================================================= - BPF_ADD 0x00 dst += src - BPF_SUB 0x10 dst -= src - BPF_MUL 0x20 dst \*= src - BPF_DIV 0x30 dst /= src - BPF_OR 0x40 dst \|= src - BPF_AND 0x50 dst &= src - BPF_LSH 0x60 dst <<= src - BPF_RSH 0x70 dst >>= src - BPF_NEG 0x80 dst = ~src - BPF_MOD 0x90 dst %= src - BPF_XOR 0xa0 dst ^= src - BPF_MOV 0xb0 dst = src - BPF_ARSH 0xc0 sign extending shift right - BPF_END 0xd0 byte swap operations (see separate section below) - ======== ===== ================================================= +The 'code' field encodes the operation as below: + +======== ===== ========================================================== +code value description +======== ===== ========================================================== +BPF_ADD 0x00 dst += src +BPF_SUB 0x10 dst -= src +BPF_MUL 0x20 dst \*= src +BPF_DIV 0x30 dst /= src +BPF_OR 0x40 dst \|= src +BPF_AND 0x50 dst &= src +BPF_LSH 0x60 dst <<= src +BPF_RSH 0x70 dst >>= src +BPF_NEG 0x80 dst = ~src +BPF_MOD 0x90 dst %= src +BPF_XOR 0xa0 dst ^= src +BPF_MOV 0xb0 dst = src +BPF_ARSH 0xc0 sign extending shift right +BPF_END 0xd0 byte swap operations (see `Byte swap instructions`_ below) +======== ===== ========================================================== -BPF_ADD | BPF_X | BPF_ALU means:: +``BPF_ADD | BPF_X | BPF_ALU`` means:: dst_reg = (u32) dst_reg + (u32) src_reg; -BPF_ADD | BPF_X | BPF_ALU64 means:: +``BPF_ADD | BPF_X | BPF_ALU64`` means:: dst_reg = dst_reg + src_reg -BPF_XOR | BPF_K | BPF_ALU means:: +``BPF_XOR | BPF_K | BPF_ALU`` means:: src_reg = (u32) src_reg ^ (u32) imm32 -BPF_XOR | BPF_K | BPF_ALU64 means:: +``BPF_XOR | BPF_K | BPF_ALU64`` means:: src_reg = src_reg ^ imm32 Byte swap instructions ----------------------- +~~~~~~~~~~~~~~~~~~~~~~ The byte swap instructions use an instruction class of ``BPF_ALU`` and a 4-bit -code field of ``BPF_END``. +'code' field of ``BPF_END``. The byte swap instructions operate on the destination register only and do not use a separate source register or immediate value. @@ -136,14 +141,14 @@ only and do not use a separate source register or immediate value. The 1-bit source operand field in the opcode is used to select what byte order the operation converts from or to: - ========= ===== ================================================= - source value description - ========= ===== ================================================= - BPF_TO_LE 0x00 convert between host byte order and little endian - BPF_TO_BE 0x08 convert between host byte order and big endian - ========= ===== ================================================= +========= ===== ================================================= +source value description +========= ===== ================================================= +BPF_TO_LE 0x00 convert between host byte order and little endian +BPF_TO_BE 0x08 convert between host byte order and big endian +========= ===== ================================================= -The imm field encodes the width of the swap operations. The following widths +The 'imm' field encodes the width of the swap operations. The following widths are supported: 16, 32 and 64.
Examples: @@ -159,28 +164,28 @@ Examples: Jump instructions ----------------- -BPF_JMP32 uses 32-bit wide operands while BPF_JMP uses 64-bit wide operands for +``BPF_JMP32`` uses 32-bit wide operands while ``BPF_JMP`` uses 64-bit wide operands for otherwise identical operations. -The code field encodes the operation as below: +The 'code' field encodes the operation as below: - ======== ===== ========================= ============ - code value description notes - ======== ===== ========================= ============ - BPF_JA 0x00 PC += off BPF_JMP only - BPF_JEQ 0x10 PC += off if dst == src - BPF_JGT 0x20 PC += off if dst > src unsigned - BPF_JGE 0x30 PC += off if dst >= src unsigned - BPF_JSET 0x40 PC += off if dst & src - BPF_JNE 0x50 PC += off if dst != src - BPF_JSGT 0x60 PC += off if dst > src signed - BPF_JSGE 0x70 PC += off if dst >= src signed - BPF_CALL 0x80 function call - BPF_EXIT 0x90 function / program return BPF_JMP only - BPF_JLT 0xa0 PC += off if dst < src unsigned - BPF_JLE 0xb0 PC += off if dst <= src unsigned - BPF_JSLT 0xc0 PC += off if dst < src signed - BPF_JSLE 0xd0 PC += off if dst <= src signed - ======== ===== ========================= ============ +======== ===== ========================= ============ +code value description notes +======== ===== ========================= ============ +BPF_JA 0x00 PC += off BPF_JMP only +BPF_JEQ 0x10 PC += off if dst == src +BPF_JGT 0x20 PC += off if dst > src unsigned +BPF_JGE 0x30 PC += off if dst >= src unsigned +BPF_JSET 0x40 PC += off if dst & src +BPF_JNE 0x50 PC += off if dst != src +BPF_JSGT 0x60 PC += off if dst > src signed +BPF_JSGE 0x70 PC += off if dst >= src signed +BPF_CALL 0x80 function call +BPF_EXIT 0x90 function / program return BPF_JMP only +BPF_JLT 0xa0 PC += off if dst < src unsigned +BPF_JLE 0xb0 PC += off if dst <= src unsigned +BPF_JSLT 0xc0 PC += off if dst < src signed +BPF_JSLE 0xd0 PC += off if dst <= src signed +======== ===== ========================= ============ The eBPF program needs to store the return value into register R0 before doing a BPF_EXIT. @@ -189,14 +194,26 @@ BPF_EXIT. 
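As a concrete illustration of the R0 convention above, a program that just
returns 0 needs only two instructions. A minimal sketch using the
``struct bpf_insn`` definitions from include/uapi/linux/bpf.h::

    #include <linux/bpf.h>

    /* "return 0": move 0 into R0, then BPF_EXIT. */
    const struct bpf_insn prog[] = {
        { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
        { .code = BPF_JMP | BPF_EXIT },
    };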
Load and store instructions =========================== -For load and store instructions (BPF_LD, BPF_LDX, BPF_ST and BPF_STX), the +For load and store instructions (``BPF_LD``, ``BPF_LDX``, ``BPF_ST``, and ``BPF_STX``), the 8-bit 'opcode' field is divided as: - ============ ====== ================= - 3 bits (MSB) 2 bits 3 bits (LSB) - ============ ====== ================= - mode size instruction class - ============ ====== ================= +============ ====== ================= +3 bits (MSB) 2 bits 3 bits (LSB) +============ ====== ================= +mode size instruction class +============ ====== ================= + +The mode modifier is one of: + + ============= ===== ==================================== ============= + mode modifier value description reference + ============= ===== ==================================== ============= + BPF_IMM 0x00 64-bit immediate instructions `64-bit immediate instructions`_ + BPF_ABS 0x20 legacy BPF packet access (absolute) `Legacy BPF Packet access instructions`_ + BPF_IND 0x40 legacy BPF packet access (indirect) `Legacy BPF Packet access instructions`_ + BPF_MEM 0x60 regular load and store operations `Regular load and store operations`_ + BPF_ATOMIC 0xc0 atomic operations `Atomic operations`_ + ============= ===== ==================================== ============= The size modifier is one of: @@ -209,19 +226,6 @@ The size modifier is one of: BPF_DW 0x18 double word (8 bytes) ============= ===== ===================== -The mode modifier is one of: - - ============= ===== ==================================== - mode modifier value description - ============= ===== ==================================== - BPF_IMM 0x00 64-bit immediate instructions - BPF_ABS 0x20 legacy BPF packet access (absolute) - BPF_IND 0x40 legacy BPF packet access (indirect) - BPF_MEM 0x60 regular load and store operations - BPF_ATOMIC 0xc0 atomic operations - ============= ===== ==================================== - - Regular load and store operations --------------------------------- @@ -252,42 +256,42 @@ by other eBPF programs or means outside of this specification. All atomic operations supported by eBPF are encoded as store operations that use the ``BPF_ATOMIC`` mode modifier as follows: - * ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations - * ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations - * 8-bit and 16-bit wide atomic operations are not supported. +* ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations +* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations +* 8-bit and 16-bit wide atomic operations are not supported. -The imm field is used to encode the actual atomic operation. +The 'imm' field is used to encode the actual atomic operation. 
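From BPF C, atomic instructions are typically produced by compiler atomics
rather than written by hand. A minimal sketch, assuming clang and libbpf's
bpf_helpers.h::

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    __u64 counter = 0;

    SEC("xdp")
    int count_packets(struct xdp_md *ctx)
    {
        /* clang emits BPF_ATOMIC | BPF_DW | BPF_STX with imm = BPF_ADD;
         * using the return value would add the BPF_FETCH flag. */
        __sync_fetch_and_add(&counter, 1);
        return XDP_PASS;
    }

    char LICENSE[] SEC("license") = "GPL";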
Simple atomic operations use a subset of the values defined to encode
-arithmetic operations in the imm field to encode the atomic operation:
+arithmetic operations in the 'imm' field to encode the atomic operation:

- ======== ===== ===========
- imm value description
- ======== ===== ===========
- BPF_ADD 0x00 atomic add
- BPF_OR 0x40 atomic or
- BPF_AND 0x50 atomic and
- BPF_XOR 0xa0 atomic xor
- ======== ===== ===========
+======== ===== ===========
+imm value description
+======== ===== ===========
+BPF_ADD 0x00 atomic add
+BPF_OR 0x40 atomic or
+BPF_AND 0x50 atomic and
+BPF_XOR 0xa0 atomic xor
+======== ===== ===========

-``BPF_ATOMIC | BPF_W | BPF_STX`` with imm = BPF_ADD means::
+``BPF_ATOMIC | BPF_W | BPF_STX`` with 'imm' = BPF_ADD means::

 *(u32 *)(dst_reg + off16) += src_reg

-``BPF_ATOMIC | BPF_DW | BPF_STX`` with imm = BPF ADD means::
+``BPF_ATOMIC | BPF_DW | BPF_STX`` with 'imm' = BPF_ADD means::

 *(u64 *)(dst_reg + off16) += src_reg

 In addition to the simple atomic operations, there are also a modifier and
 two complex atomic operations:

- =========== ================ ===========================
- imm value description
- =========== ================ ===========================
- BPF_FETCH 0x01 modifier: return old value
- BPF_XCHG 0xe0 | BPF_FETCH atomic exchange
- BPF_CMPXCHG 0xf0 | BPF_FETCH atomic compare and exchange
- =========== ================ ===========================
+=========== ================ ===========================
+imm value description
+=========== ================ ===========================
+BPF_FETCH 0x01 modifier: return old value
+BPF_XCHG 0xe0 | BPF_FETCH atomic exchange
+BPF_CMPXCHG 0xf0 | BPF_FETCH atomic compare and exchange
+=========== ================ ===========================

 The ``BPF_FETCH`` modifier is optional for simple atomic operations, and
 always set for the complex atomic operations. If the ``BPF_FETCH`` flag
@@ -306,7 +310,7 @@ and loaded back to ``R0``.

 64-bit immediate instructions
 -----------------------------

-Instructions with the ``BPF_IMM`` mode modifier use the wide instruction
+Instructions with the ``BPF_IMM`` 'mode' modifier use the wide instruction
 encoding for an extra imm64 value.

 There is currently only one such instruction.

From d863f42930db35e82f47e4b4c78531a2b8d396ae Mon Sep 17 00:00:00 2001
From: Yuan Can
Date: Wed, 28 Sep 2022 09:04:39 +0000
Subject: [PATCH 134/143] bpftool: Remove unused struct btf_attach_point

After commit 2828d0d75b73 ("bpftool: Switch to libbpf's hashmap for
programs/maps in BTF listing"), struct btf_attach_point is not used
anymore and can be removed as well.
Signed-off-by: Yuan Can Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220928090440.79637-2-yuancan@huawei.com --- tools/bpf/bpftool/btf.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 0744bd1150be..64411fe49a66 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -43,11 +43,6 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_ENUM64] = "ENUM64", }; -struct btf_attach_point { - __u32 obj_id; - __u32 btf_id; -}; - static const char *btf_int_enc_str(__u8 encoding) { switch (encoding) { From f95a479797dc2c65fdf2809a7c388e7a9e2bc853 Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Wed, 28 Sep 2022 09:04:40 +0000 Subject: [PATCH 135/143] bpftool: Remove unused struct event_ring_info After commit 9b190f185d2f ("tools/bpftool: switch map event_pipe to libbpf's perf_buffer"), struct event_ring_info is not used any more and can be removed as well. Signed-off-by: Yuan Can Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220928090440.79637-3-yuancan@huawei.com --- tools/bpf/bpftool/map_perf_ring.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c index 6b0c410152de..1583281d1327 100644 --- a/tools/bpf/bpftool/map_perf_ring.c +++ b/tools/bpf/bpftool/map_perf_ring.c @@ -29,13 +29,6 @@ static volatile bool stop; -struct event_ring_info { - int fd; - int key; - unsigned int cpu; - void *mem; -}; - struct perf_event_sample { struct perf_event_header header; __u64 time; From b59cc7fcbaebde52ed97f63c6c50e49b8dd5be37 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Tue, 27 Sep 2022 15:25:27 -0400 Subject: [PATCH 136/143] samples/bpf: Fix typo in xdp_router_ipv4 sample Fix typo in xdp_router_ipv4 sample. Signed-off-by: Deming Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220927192527.8722-1-wangdeming@inspur.com --- samples/bpf/xdp_router_ipv4_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index 294fc15ad1cb..683913bbf279 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -209,7 +209,7 @@ static void read_route(struct nlmsghdr *nh, int nll) /* Rereading the route table to check if * there is an entry with the same * prefix but a different metric as the - * deleted enty. + * deleted entry. */ get_route_table(AF_INET); } else if (prefix_key->data[0] == From 2efcf695bfc0f078dd7d5d23d96a97db34c930d5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Sep 2022 23:15:55 +0100 Subject: [PATCH 137/143] selftests/bpf: Fix spelling mistake "unpriviledged" -> "unprivileged" There are a couple of spelling mistakes, one in a literal string and one in a comment. Fix them. 
Signed-off-by: Colin Ian King
Signed-off-by: Andrii Nakryiko
Acked-by: John Fastabend
Link: https://lore.kernel.org/bpf/20220928221555.67873-1-colin.i.king@gmail.com
---
 tools/testing/selftests/bpf/verifier/calls.c | 2 +-
 tools/testing/selftests/bpf/verifier/var_off.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 3fb4f69b1962..e1a937277b54 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -284,7 +284,7 @@
 .result = ACCEPT,
 },
 {
-	"calls: not on unpriviledged",
+	"calls: not on unprivileged",
 .insns = {
 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
 BPF_MOV64_IMM(BPF_REG_0, 1),
diff --git a/tools/testing/selftests/bpf/verifier/var_off.c b/tools/testing/selftests/bpf/verifier/var_off.c
index 187c6f6e32bc..d37f512fad16 100644
--- a/tools/testing/selftests/bpf/verifier/var_off.c
+++ b/tools/testing/selftests/bpf/verifier/var_off.c
@@ -121,7 +121,7 @@
 BPF_EXIT_INSN(),
 },
 .fixup_map_hash_8b = { 1 },
-	/* The unpriviledged case is not too interesting; variable
+	/* The unprivileged case is not too interesting; variable
 * stack access is rejected.
 */
 .errstr_unpriv = "R2 variable stack access prohibited for !root",

From 51e05a8cf8eb34da7473823b7f236a77adfef0b4 Mon Sep 17 00:00:00 2001
From: Xin Liu
Date: Fri, 30 Sep 2022 17:07:08 +0800
Subject: [PATCH 138/143] libbpf: Fix overrun in netlink attribute iteration

I accidentally found that the change from commit 1045b03e07d8 ("netlink:
fix overrun in attribute iteration") was never synchronized to the
function `nla_ok` in tools/lib/bpf/nlattr.c. This patch brings the two
implementations back in sync.

Signed-off-by: Xin Liu
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20220930090708.62394-1-liuxin350@huawei.com
---
 tools/lib/bpf/nlattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
index f57e77a6e40f..3900d052ed19 100644
--- a/tools/lib/bpf/nlattr.c
+++ b/tools/lib/bpf/nlattr.c
@@ -32,7 +32,7 @@ static struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
 
 static int nla_ok(const struct nlattr *nla, int remaining)
 {
-	return remaining >= sizeof(*nla) &&
+	return remaining >= (int)sizeof(*nla) &&
 	       nla->nla_len >= sizeof(*nla) &&
 	       nla->nla_len <= remaining;
 }

From 3ca2fb497440a3c8294f9df0ce7b2c3c9a1c5875 Mon Sep 17 00:00:00 2001
From: Tianyi Liu
Date: Wed, 28 Sep 2022 16:09:32 +0800
Subject: [PATCH 139/143] bpftool: Fix error message of strerror

strerror() expects a positive errno, but the variable err is never
positive when an error occurs. This causes bpftool to print "unknown
error" far too often; even a simple "file does not exist" error cannot
produce an accurate message.

This patch fixes all "strerror(err)" patterns in bpftool. The one
exception is in btf.c#L823: hashmap__append() is an internal libbpf
function that does not change errno, so that case is handled slightly
differently. Some libbpf_get_error() calls are kept for return values.

Changes since v1:
https://lore.kernel.org/bpf/SY4P282MB1084B61CD8671DFA395AA8579D539@SY4P282MB1084.AUSP282.PROD.OUTLOOK.COM/
Check directly for NULL values instead of calling libbpf_get_error().
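The failure mode, reduced to a standalone userspace sketch (this is not
bpftool code, just the strerror() contract on glibc):

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          if (!fopen("/nonexistent", "r")) {
                  int err = -errno; /* libbpf-style negative error code */

                  printf("%s\n", strerror(err));  /* "Unknown error -2" */
                  printf("%s\n", strerror(-err)); /* "No such file or directory" */
          }
          return 0;
  }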
Signed-off-by: Tianyi Liu Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/SY4P282MB1084AD9CD84A920F08DF83E29D549@SY4P282MB1084.AUSP282.PROD.OUTLOOK.COM --- tools/bpf/bpftool/btf.c | 11 +++++------ tools/bpf/bpftool/gen.c | 4 ++-- tools/bpf/bpftool/map_perf_ring.c | 7 +++---- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 64411fe49a66..68a70ac03c80 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -635,10 +635,9 @@ static int do_dump(int argc, char **argv) btf = btf__parse_split(*argv, base ?: base_btf); err = libbpf_get_error(btf); - if (err) { - btf = NULL; + if (!btf) { p_err("failed to load BTF from %s: %s", - *argv, strerror(err)); + *argv, strerror(errno)); goto done; } NEXT_ARG(); @@ -683,8 +682,8 @@ static int do_dump(int argc, char **argv) btf = btf__load_from_kernel_by_id_split(btf_id, base_btf); err = libbpf_get_error(btf); - if (err) { - p_err("get btf by id (%u): %s", btf_id, strerror(err)); + if (!btf) { + p_err("get btf by id (%u): %s", btf_id, strerror(errno)); goto done; } } @@ -820,7 +819,7 @@ build_btf_type_table(struct hashmap *tab, enum bpf_obj_type type, u32_as_hash_field(id)); if (err) { p_err("failed to append entry to hashmap for BTF ID %u, object ID %u: %s", - btf_id, id, strerror(errno)); + btf_id, id, strerror(-err)); goto err_free; } } diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 7070dcffa822..cf8b4e525c88 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1594,14 +1594,14 @@ static int do_object(int argc, char **argv) err = bpf_linker__add_file(linker, file, NULL); if (err) { - p_err("failed to link '%s': %s (%d)", file, strerror(err), err); + p_err("failed to link '%s': %s (%d)", file, strerror(errno), errno); goto out; } } err = bpf_linker__finalize(linker); if (err) { - p_err("failed to finalize ELF file: %s (%d)", strerror(err), err); + p_err("failed to finalize ELF file: %s (%d)", strerror(errno), errno); goto out; } diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c index 1583281d1327..21d7d447e1f3 100644 --- a/tools/bpf/bpftool/map_perf_ring.c +++ b/tools/bpf/bpftool/map_perf_ring.c @@ -188,10 +188,9 @@ int do_event_pipe(int argc, char **argv) opts.map_keys = &ctx.idx; pb = perf_buffer__new_raw(map_fd, MMAP_PAGE_CNT, &perf_attr, print_bpf_output, &ctx, &opts); - err = libbpf_get_error(pb); - if (err) { + if (!pb) { p_err("failed to create perf buffer: %s (%d)", - strerror(err), err); + strerror(errno), errno); goto err_close_map; } @@ -206,7 +205,7 @@ int do_event_pipe(int argc, char **argv) err = perf_buffer__poll(pb, 200); if (err < 0 && err != -EINTR) { p_err("perf buffer polling failed: %s (%d)", - strerror(err), err); + strerror(errno), errno); goto err_close_pb; } } From 5f388bba7acbdb097a9e7ed932a39b40f7eb2acf Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 29 Sep 2022 11:01:33 +0200 Subject: [PATCH 140/143] selftests/xsk: Fix double free Fix a double free at exit of the test suite. 
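The bug class, reduced to a standalone sketch (the names mirror the
selftest, but the function bodies here are illustrative only):

  #include <stdlib.h>

  struct ifobject { void *umem; };

  /* Owns ifobj and everything hanging off it, including umem. */
  static void ifobject_delete(struct ifobject *ifobj)
  {
          free(ifobj->umem);
          free(ifobj);
  }

  int main(void)
  {
          struct ifobject *ifobj = calloc(1, sizeof(*ifobj));

          if (!ifobj)
                  return 1;
          ifobj->umem = malloc(16);
          free(ifobj->umem);      /* first free... */
          ifobject_delete(ifobj); /* ...frees umem again: double free */
          return 0;
  }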
Fixes: a693ff3ed561 ("selftests/xsk: Add support for executing tests on physical device")
Signed-off-by: Magnus Karlsson
Signed-off-by: Andrii Nakryiko
Acked-by: Maciej Fijalkowski
Link: https://lore.kernel.org/bpf/20220929090133.7869-1-magnus.karlsson@gmail.com
---
 tools/testing/selftests/bpf/xskxceiver.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index ef33309bbe49..d1a5f3218c34 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -1953,9 +1953,6 @@ int main(int argc, char **argv)
 pkt_stream_delete(tx_pkt_stream_default);
 pkt_stream_delete(rx_pkt_stream_default);
-	free(ifobj_rx->umem);
-	if (!ifobj_tx->shared_umem)
-		free(ifobj_tx->umem);
 ifobject_delete(ifobj_tx);
 ifobject_delete(ifobj_rx);

From b502a6fb46d275aa978c1e0655bada2cafc81fea Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Sat, 1 Oct 2022 08:49:45 -0700
Subject: [PATCH 141/143] bpf, docs: Delete misformatted table.

Delete misformatted table.

Fixes: 6166da0a02cd ("bpf, docs: Move legacy packet instructions to a separate file")
Signed-off-by: Alexei Starovoitov
---
 Documentation/bpf/linux-notes.rst | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/Documentation/bpf/linux-notes.rst b/Documentation/bpf/linux-notes.rst
index 1c31379b469f..956b0c86699d 100644
--- a/Documentation/bpf/linux-notes.rst
+++ b/Documentation/bpf/linux-notes.rst
@@ -51,20 +51,3 @@ where ``ntohl()`` converts a 32-bit value from network byte order to host byte o
 ``BPF_IND | BPF_W | BPF_LD`` (0x40) means::

 R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + src + imm))
-
-Appendix
-========
-
-For reference, the following table lists legacy Linux-specific opcodes in order by value.
-
-====== ==== =================================================== =============
-opcode imm description reference
-====== ==== =================================================== =============
-0x20 any dst = ntohl(\*(uint32_t \*)(R6->data + imm)) `Legacy BPF Packet access instructions`_
-0x28 any dst = ntohs(\*(uint16_t \*)(R6->data + imm)) `Legacy BPF Packet access instructions`_
-0x30 any dst = (\*(uint8_t \*)(R6->data + imm)) `Legacy BPF Packet access instructions`_
-0x38 any dst = ntohll(\*(uint64_t \*)(R6->data + imm)) `Legacy BPF Packet access instructions`_
-0x40 any dst = ntohl(\*(uint32_t \*)(R6->data + src + imm)) `Legacy BPF Packet access instructions`_
-0x48 any dst = ntohs(\*(uint16_t \*)(R6->data + src + imm)) `Legacy BPF Packet access instructions`_
-0x50 any dst = \*(uint8_t \*)(R6->data + src + imm)) `Legacy BPF Packet access instructions`_
-0x58 any dst = ntohll(\*(uint64_t \*)(R6->data + src + imm)) `Legacy BPF Packet access instructions`_

From 736baae643cb1ccaeaf989ef8eb09ce085020479 Mon Sep 17 00:00:00 2001
From: Bagas Sanjaya
Date: Sun, 2 Oct 2022 10:20:23 +0700
Subject: [PATCH 142/143] Documentation: bpf: Add implementation notes
 documentations to table of contents

Sphinx reported warnings about the implementation notes documents missing
from the table of contents:

Documentation/bpf/clang-notes.rst: WARNING: document isn't included in any toctree
Documentation/bpf/linux-notes.rst: WARNING: document isn't included in any toctree

Add these documents to the table of contents (index.rst) of the BPF
documentation to fix the warnings.
Link: https://lore.kernel.org/linux-doc/202210020749.yfgDZbRL-lkp@intel.com/
Fixes: 6c7aaffb24efbd ("bpf, docs: Move Clang notes to a separate file")
Fixes: 6166da0a02cde2 ("bpf, docs: Move legacy packet instructions to a separate file")
Reported-by: kernel test robot
Signed-off-by: Bagas Sanjaya
Link: https://lore.kernel.org/r/20221002032022.24693-1-bagasdotme@gmail.com
Signed-off-by: Alexei Starovoitov
---
 Documentation/bpf/index.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst
index 1bc2c5c58bdb..1b50de1983ee 100644
--- a/Documentation/bpf/index.rst
+++ b/Documentation/bpf/index.rst
@@ -26,6 +26,8 @@ that goes into great technical depth about the BPF Architecture.
 classic_vs_extended.rst
 bpf_licensing
 test_debug
+  clang-notes
+  linux-notes
 other

 .. only:: subproject and html

From 820dc0523e05c12810bb6bf4e56ce26e4c1948a2 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi
Date: Fri, 30 Sep 2022 00:38:43 +0200
Subject: [PATCH 143/143] net: netfilter: move bpf_ct_set_nat_info kfunc in
 nf_nat_bpf.c

Remove the circular dependency between the nf_nat module and the
nf_conntrack one by moving the bpf_ct_set_nat_info kfunc into
nf_nat_bpf.c.

Fixes: 0fabd2aa199f ("net: netfilter: add bpf_ct_set_nat_info kfunc helper")
Suggested-by: Kumar Kartikeya Dwivedi
Tested-by: Nathan Chancellor
Tested-by: Yauheni Kaliuta
Signed-off-by: Lorenzo Bianconi
Acked-by: John Fastabend
Link: https://lore.kernel.org/r/51a65513d2cda3eeb0754842e8025ab3966068d8.1664490511.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/net/netfilter/nf_conntrack_bpf.h | 19 ++++++
 net/netfilter/Makefile | 6 ++
 net/netfilter/nf_conntrack_bpf.c | 50 ---------------
 net/netfilter/nf_nat_bpf.c | 79 ++++++++++++++++++++++++
 net/netfilter/nf_nat_core.c | 4 +-
 5 files changed, 106 insertions(+), 52 deletions(-)
 create mode 100644 net/netfilter/nf_nat_bpf.c

diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
index c8b80add1142..2d0da478c8e0 100644
--- a/include/net/netfilter/nf_conntrack_bpf.h
+++ b/include/net/netfilter/nf_conntrack_bpf.h
@@ -4,6 +4,11 @@
 #define _NF_CONNTRACK_BPF_H

 #include
+#include
+
+struct nf_conn___init {
+	struct nf_conn ct;
+};

 #if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
 (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
@@ -24,4 +29,18 @@ static inline void cleanup_nf_conntrack_bpf(void)

 #endif

+#if (IS_BUILTIN(CONFIG_NF_NAT) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_NF_NAT) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+extern int register_nf_nat_bpf(void);
+
+#else
+
+static inline int register_nf_nat_bpf(void)
+{
+	return 0;
+}
+
+#endif
+
 #endif /* _NF_CONNTRACK_BPF_H */
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 06df49ea6329..0f060d100880 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -60,6 +60,12 @@ obj-$(CONFIG_NF_NAT) += nf_nat.o
 nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
 nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o

+ifeq ($(CONFIG_NF_NAT),m)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o
+else ifeq ($(CONFIG_NF_NAT),y)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF) += nf_nat_bpf.o
+endif
+
 # NAT helpers
 obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
 obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 756ea818574e..8639e7efd0e2 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ 
b/net/netfilter/nf_conntrack_bpf.c @@ -14,10 +14,8 @@ #include #include #include -#include #include #include -#include /* bpf_ct_opts - Options for CT lookup helpers * @@ -239,10 +237,6 @@ __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in nf_conntrack BTF"); -struct nf_conn___init { - struct nf_conn ct; -}; - /* bpf_xdp_ct_alloc - Allocate a new CT entry * * Parameters: @@ -476,49 +470,6 @@ int bpf_ct_change_status(struct nf_conn *nfct, u32 status) return nf_ct_change_status_common(nfct, status); } -/* bpf_ct_set_nat_info - Set source or destination nat address - * - * Set source or destination nat address of the newly allocated - * nf_conn before insertion. This must be invoked for referenced - * PTR_TO_BTF_ID to nf_conn___init. - * - * Parameters: - * @nfct - Pointer to referenced nf_conn object, obtained using - * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. - * @addr - Nat source/destination address - * @port - Nat source/destination port. Non-positive values are - * interpreted as select a random port. - * @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST - */ -int bpf_ct_set_nat_info(struct nf_conn___init *nfct, - union nf_inet_addr *addr, int port, - enum nf_nat_manip_type manip) -{ -#if ((IS_MODULE(CONFIG_NF_NAT) && IS_MODULE(CONFIG_NF_CONNTRACK)) || \ - IS_BUILTIN(CONFIG_NF_NAT)) - struct nf_conn *ct = (struct nf_conn *)nfct; - u16 proto = nf_ct_l3num(ct); - struct nf_nat_range2 range; - - if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6) - return -EINVAL; - - memset(&range, 0, sizeof(struct nf_nat_range2)); - range.flags = NF_NAT_RANGE_MAP_IPS; - range.min_addr = *addr; - range.max_addr = range.min_addr; - if (port > 0) { - range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - range.min_proto.all = cpu_to_be16(port); - range.max_proto.all = range.min_proto.all; - } - - return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; -#else - return -EOPNOTSUPP; -#endif -} - __diag_pop() BTF_SET8_START(nf_ct_kfunc_set) @@ -532,7 +483,6 @@ BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) BTF_SET8_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c new file mode 100644 index 000000000000..0fa5a0bbb0ff --- /dev/null +++ b/net/netfilter/nf_nat_bpf.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable NAT Helpers for XDP and TC-BPF hook + * + * These are called from the XDP and SCHED_CLS BPF programs. Note that it is + * allowed to break compatibility for these functions since the interface they + * are exposed through to BPF programs is explicitly unstable. + */ + +#include +#include +#include +#include +#include + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in nf_nat BTF"); + +/* bpf_ct_set_nat_info - Set source or destination nat address + * + * Set source or destination nat address of the newly allocated + * nf_conn before insertion. This must be invoked for referenced + * PTR_TO_BTF_ID to nf_conn___init. + * + * Parameters: + * @nfct - Pointer to referenced nf_conn object, obtained using + * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. + * @addr - Nat source/destination address + * @port - Nat source/destination port. 
Non-positive values are + * interpreted as select a random port. + * @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST + */ +int bpf_ct_set_nat_info(struct nf_conn___init *nfct, + union nf_inet_addr *addr, int port, + enum nf_nat_manip_type manip) +{ + struct nf_conn *ct = (struct nf_conn *)nfct; + u16 proto = nf_ct_l3num(ct); + struct nf_nat_range2 range; + + if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6) + return -EINVAL; + + memset(&range, 0, sizeof(struct nf_nat_range2)); + range.flags = NF_NAT_RANGE_MAP_IPS; + range.min_addr = *addr; + range.max_addr = range.min_addr; + if (port > 0) { + range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + range.min_proto.all = cpu_to_be16(port); + range.max_proto.all = range.min_proto.all; + } + + return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; +} + +__diag_pop() + +BTF_SET8_START(nf_nat_kfunc_set) +BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) +BTF_SET8_END(nf_nat_kfunc_set) + +static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { + .owner = THIS_MODULE, + .set = &nf_nat_kfunc_set, +}; + +int register_nf_nat_bpf(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, + &nf_bpf_nat_kfunc_set); + if (ret) + return ret; + + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, + &nf_bpf_nat_kfunc_set); +} diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7981be526f26..d8e6380f6337 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include @@ -1152,7 +1152,7 @@ static int __init nf_nat_init(void) WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); - return 0; + return register_nf_nat_bpf(); } static void __exit nf_nat_cleanup(void)
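For reference, a BPF-side sketch of how the moved kfunc is used together with
the conntrack allocation/insertion kfuncs. This assumes a vmlinux.h generated
from a kernel with conntrack and NAT enabled, plus libbpf; the local struct
bpf_ct_opts___local mirrors the kernel's unstable layout (the ___ suffix keeps
it from clashing with the vmlinux.h definition), the addresses and ports are
made up, and error handling is pared down to the minimum the verifier needs:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    /* Local copy of the kernel's unstable bpf_ct_opts layout. */
    struct bpf_ct_opts___local {
            s32 netns_id;
            s32 error;
            u8 l4proto;
            u8 dir;
            u8 reserved[2];
    };

    extern struct nf_conn___init *
    bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
                     u32 tuple__sz, struct bpf_ct_opts___local *opts,
                     u32 opts__sz) __ksym;
    extern struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct) __ksym;
    extern void bpf_ct_release(struct nf_conn *ct) __ksym;
    extern int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
                                   union nf_inet_addr *addr, int port,
                                   enum nf_nat_manip_type manip) __ksym;

    SEC("xdp")
    int xdp_snat_sketch(struct xdp_md *ctx)
    {
            struct bpf_ct_opts___local opts = { .netns_id = -1, .l4proto = IPPROTO_TCP };
            union nf_inet_addr snat_addr = { .ip = bpf_htonl(0x0a000001) }; /* 10.0.0.1 */
            struct bpf_sock_tuple tup = {};
            struct nf_conn___init *ct_init;
            struct nf_conn *ct;

            tup.ipv4.saddr = bpf_htonl(0xc0a80102); /* 192.168.1.2 */
            tup.ipv4.daddr = bpf_htonl(0xc0a80101); /* 192.168.1.1 */
            tup.ipv4.sport = bpf_htons(12345);
            tup.ipv4.dport = bpf_htons(80);

            ct_init = bpf_xdp_ct_alloc(ctx, &tup, sizeof(tup.ipv4), &opts,
                                       sizeof(opts));
            if (!ct_init)
                    return XDP_PASS;

            /* SNAT the new entry to 10.0.0.1 with a random source port. */
            bpf_ct_set_nat_info(ct_init, &snat_addr, -1, NF_NAT_MANIP_SRC);

            ct = bpf_ct_insert_entry(ct_init);
            if (ct)
                    bpf_ct_release(ct);
            return XDP_PASS;
    }

    char LICENSE[] SEC("license") = "GPL";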