From c453312857ba41129db3558f5428405bbbb8f1a4 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 9 Dec 2019 11:17:30 +0000 Subject: [PATCH 001/127] ARM: net: bpf: Improve prologue code sequence Improve the prologue code sequence to be able to take advantage of 64-bit stores, changing the code from: push {r4, r5, r6, r7, r8, r9, fp, lr} mov fp, sp sub ip, sp, #80 ; 0x50 sub sp, sp, #600 ; 0x258 str ip, [fp, #-100] ; 0xffffff9c mov r6, #0 str r6, [fp, #-96] ; 0xffffffa0 mov r4, #0 mov r3, r4 mov r2, r0 str r4, [fp, #-104] ; 0xffffff98 str r4, [fp, #-108] ; 0xffffff94 to the tighter: push {r4, r5, r6, r7, r8, r9, fp, lr} mov fp, sp mov r3, #0 sub r2, sp, #80 ; 0x50 sub sp, sp, #600 ; 0x258 strd r2, [fp, #-100] ; 0xffffff9c mov r2, #0 strd r2, [fp, #-108] ; 0xffffff94 mov r2, r0 resulting in a saving of three instructions. Signed-off-by: Russell King Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/E1ieH2g-0004ih-Rb@rmk-PC.armlinux.org.uk --- arch/arm/net/bpf_jit_32.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 97dc386e3cb8..cc29869d12a3 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1260,12 +1260,9 @@ static inline void emit_push_r64(const s8 src[], struct jit_ctx *ctx) static void build_prologue(struct jit_ctx *ctx) { - const s8 r0 = bpf2a32[BPF_REG_0][1]; - const s8 r2 = bpf2a32[BPF_REG_1][1]; - const s8 r3 = bpf2a32[BPF_REG_1][0]; - const s8 r4 = bpf2a32[BPF_REG_6][1]; - const s8 fplo = bpf2a32[BPF_REG_FP][1]; - const s8 fphi = bpf2a32[BPF_REG_FP][0]; + const s8 arm_r0 = bpf2a32[BPF_REG_0][1]; + const s8 *bpf_r1 = bpf2a32[BPF_REG_1]; + const s8 *bpf_fp = bpf2a32[BPF_REG_FP]; const s8 *tcc = bpf2a32[TCALL_CNT]; /* Save callee saved registers. */ @@ -1278,8 +1275,10 @@ static void build_prologue(struct jit_ctx *ctx) emit(ARM_PUSH(CALLEE_PUSH_MASK), ctx); emit(ARM_MOV_R(ARM_FP, ARM_SP), ctx); #endif - /* Save frame pointer for later */ - emit(ARM_SUB_I(ARM_IP, ARM_SP, SCRATCH_SIZE), ctx); + /* mov r3, #0 */ + /* sub r2, sp, #SCRATCH_SIZE */ + emit(ARM_MOV_I(bpf_r1[0], 0), ctx); + emit(ARM_SUB_I(bpf_r1[1], ARM_SP, SCRATCH_SIZE), ctx); ctx->stack_size = imm8m(STACK_SIZE); @@ -1287,18 +1286,15 @@ static void build_prologue(struct jit_ctx *ctx) emit(ARM_SUB_I(ARM_SP, ARM_SP, ctx->stack_size), ctx); /* Set up BPF prog stack base register */ - emit_a32_mov_r(fplo, ARM_IP, ctx); - emit_a32_mov_i(fphi, 0, ctx); + emit_a32_mov_r64(true, bpf_fp, bpf_r1, ctx); - /* mov r4, 0 */ - emit(ARM_MOV_I(r4, 0), ctx); + /* Initialize Tail Count */ + emit(ARM_MOV_I(bpf_r1[1], 0), ctx); + emit_a32_mov_r64(true, tcc, bpf_r1, ctx); /* Move BPF_CTX to BPF_R1 */ - emit(ARM_MOV_R(r3, r4), ctx); - emit(ARM_MOV_R(r2, r0), ctx); - /* Initialize Tail Count */ - emit(ARM_STR_I(r4, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(tcc[0])), ctx); - emit(ARM_STR_I(r4, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(tcc[1])), ctx); + emit(ARM_MOV_R(bpf_r1[1], arm_r0), ctx); + /* end of prologue */ } From 09c4708d3cf42b2b2211b1aee82e7baf446a24e8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Dec 2019 14:40:22 -0800 Subject: [PATCH 002/127] libbpf: Bump libpf current version to v0.0.7 New development cycles starts, bump to v0.0.7 proactively. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191209224022.3544519-1-andriin@fb.com --- tools/lib/bpf/libbpf.map | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 8ddc2c40e482..495df575f87f 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -208,3 +208,6 @@ LIBBPF_0.0.6 { btf__find_by_name_kind; libbpf_find_vmlinux_btf_id; } LIBBPF_0.0.5; + +LIBBPF_0.0.7 { +} LIBBPF_0.0.6; From b590cb5f802dc20c72f507f7fbe6737222d0afba Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 10 Dec 2019 11:19:33 -0800 Subject: [PATCH 003/127] bpf: Switch to offsetofend in BPF_PROG_TEST_RUN Switch existing pattern of "offsetof(..., member) + FIELD_SIZEOF(..., member)' to "offsetofend(..., member)" which does exactly what we need without all the copy-paste. Suggested-by: Andrii Nakryiko Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191210191933.105321-1-sdf@google.com --- net/bpf/test_run.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 915c2d6f7fb9..85c8cbbada92 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -252,22 +252,19 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) /* priority is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + - FIELD_SIZEOF(struct __sk_buff, priority), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority), offsetof(struct __sk_buff, cb))) return -EINVAL; /* cb is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + - FIELD_SIZEOF(struct __sk_buff, cb), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), offsetof(struct __sk_buff, tstamp))) return -EINVAL; /* tstamp is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, tstamp) + - FIELD_SIZEOF(struct __sk_buff, tstamp), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, tstamp), sizeof(struct __sk_buff))) return -EINVAL; @@ -437,8 +434,7 @@ static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) /* flags is allowed */ - if (!range_is_zero(ctx, offsetof(struct bpf_flow_keys, flags) + - FIELD_SIZEOF(struct bpf_flow_keys, flags), + if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags), sizeof(struct bpf_flow_keys))) return -EINVAL; From bae141f54be83b06652c1d47e50e4e75ed4e9c7e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 6 Dec 2019 22:49:34 +0100 Subject: [PATCH 004/127] bpf: Emit audit messages upon successful prog load and unload Allow for audit messages to be emitted upon BPF program load and unload for having a timeline of events. The load itself is in syscall context, so additional info about the process initiating the BPF prog creation can be logged and later directly correlated to the unload event. The only info really needed from BPF side is the globally unique prog ID where then audit user space tooling can query / dump all info needed about the specific BPF program right upon load event and enrich the record, thus these changes needed here can be kept small and non-intrusive to the core. Raw example output: # auditctl -D # auditctl -a always,exit -F arch=x86_64 -S bpf # ausearch --start recent -m 1334 ... ---- time->Wed Nov 27 16:04:13 2019 type=PROCTITLE msg=audit(1574867053.120:84664): proctitle="./bpf" type=SYSCALL msg=audit(1574867053.120:84664): arch=c000003e syscall=321 \ success=yes exit=3 a0=5 a1=7ffea484fbe0 a2=70 a3=0 items=0 ppid=7477 \ pid=12698 auid=1001 uid=1001 gid=1001 euid=1001 suid=1001 fsuid=1001 \ egid=1001 sgid=1001 fsgid=1001 tty=pts2 ses=4 comm="bpf" \ exe="/home/jolsa/auditd/audit-testsuite/tests/bpf/bpf" \ subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1334] msg=audit(1574867053.120:84664): prog-id=76 op=LOAD ---- time->Wed Nov 27 16:04:13 2019 type=UNKNOWN[1334] msg=audit(1574867053.120:84665): prog-id=76 op=UNLOAD ... Signed-off-by: Daniel Borkmann Co-developed-by: Jiri Olsa Signed-off-by: Jiri Olsa Acked-by: Paul Moore Link: https://lore.kernel.org/bpf/20191206214934.11319-1-jolsa@kernel.org --- include/uapi/linux/audit.h | 1 + kernel/bpf/syscall.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 3ad935527177..a534d71e689a 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -116,6 +116,7 @@ #define AUDIT_FANOTIFY 1331 /* Fanotify access decision */ #define AUDIT_TIME_INJOFFSET 1332 /* Timekeeping offset injected */ #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ +#define AUDIT_BPF 1334 /* BPF subsystem */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3461ec59570..66b90eaf99fe 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -1306,6 +1307,36 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return 0; } +enum bpf_audit { + BPF_AUDIT_LOAD, + BPF_AUDIT_UNLOAD, + BPF_AUDIT_MAX, +}; + +static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { + [BPF_AUDIT_LOAD] = "LOAD", + [BPF_AUDIT_UNLOAD] = "UNLOAD", +}; + +static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) +{ + struct audit_context *ctx = NULL; + struct audit_buffer *ab; + + if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) + return; + if (audit_enabled == AUDIT_OFF) + return; + if (op == BPF_AUDIT_LOAD) + ctx = audit_context(); + ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); + if (unlikely(!ab)) + return; + audit_log_format(ab, "prog-id=%u op=%s", + prog->aux->id, bpf_audit_str[op]); + audit_log_end(ab); +} + int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1421,6 +1452,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1830,6 +1862,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) From 81c22041d9f19df07b9cba95e3cd02e0f41bc1e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 9 Dec 2019 16:08:03 +0100 Subject: [PATCH 005/127] bpf, x86, arm64: Enable jit by default when not built as always-on After Spectre 2 fix via 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config") most major distros use BPF_JIT_ALWAYS_ON configuration these days which compiles out the BPF interpreter entirely and always enables the JIT. Also given recent fix in e1608f3fa857 ("bpf: Avoid setting bpf insns pages read-only when prog is jited"), we additionally avoid fragmenting the direct map for the BPF insns pages sitting in the general data heap since they are not used during execution. Latter is only needed when run through the interpreter. Since both x86 and arm64 JITs have seen a lot of exposure over the years, are generally most up to date and maintained, there is more downside in !BPF_JIT_ALWAYS_ON configurations to have the interpreter enabled by default rather than the JIT. Add a ARCH_WANT_DEFAULT_BPF_JIT config which archs can use to set the bpf_jit_{enable,kallsyms} to 1. Back in the days the bpf_jit_kallsyms knob was set to 0 by default since major distros still had /proc/kallsyms addresses exposed to unprivileged user space which is not the case anymore. Hence both knobs are set via BPF_JIT_DEFAULT_ON which is set to 'y' in case of BPF_JIT_ALWAYS_ON or ARCH_WANT_DEFAULT_BPF_JIT. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Will Deacon Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/f78ad24795c2966efcc2ee19025fa3459f622185.1575903816.git.daniel@iogearbox.net --- arch/arm64/Kconfig | 1 + arch/x86/Kconfig | 1 + init/Kconfig | 7 +++++++ kernel/bpf/core.c | 4 ++-- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b1b4476ddb83..29d03459de20 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -69,6 +69,7 @@ config ARM64 select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG) select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5e8949953660..1f6a0388a65f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,7 @@ config X86 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANTS_THP_SWAP if X86_64 diff --git a/init/Kconfig b/init/Kconfig index a34064a031a5..890aaa62efde 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1604,6 +1604,9 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. +config ARCH_WANT_DEFAULT_BPF_JIT + bool + config BPF_JIT_ALWAYS_ON bool "Permanently enable BPF JIT and remove BPF interpreter" depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT @@ -1611,6 +1614,10 @@ config BPF_JIT_ALWAYS_ON Enables BPF JIT and removes BPF interpreter to avoid speculative execution of BPF instructions by the interpreter +config BPF_JIT_DEFAULT_ON + def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON + depends on HAVE_EBPF_JIT && BPF_JIT + config USERFAULTFD bool "Enable userfaultfd() system call" depends on MMU diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 49e32acad7d8..2ff01a716128 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -520,9 +520,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ -int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); +int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; -int bpf_jit_kallsyms __read_mostly; long bpf_jit_limit __read_mostly; static __always_inline void From 679152d3a32e305c213f83160c328c37566ae8bc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 12 Dec 2019 09:19:18 -0800 Subject: [PATCH 006/127] libbpf: Fix printf compilation warnings on ppc64le arch On ppc64le __u64 and __s64 are defined as long int and unsigned long int, respectively. This causes compiler to emit warning when %lld/%llu are used to printf 64-bit numbers. Fix this by casting to size_t/ssize_t with %zu and %zd format specifiers, respectively. v1->v2: - use size_t/ssize_t instead of custom typedefs (Martin). Fixes: 1f8e2bcb2cd5 ("libbpf: Refactor relocation handling") Fixes: abd29c931459 ("libbpf: allow specifying map definitions using BTF") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191212171918.638010-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3f09772192f1..920d4e06a5f9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1242,15 +1242,15 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, } sz = btf__resolve_size(obj->btf, t->type); if (sz < 0) { - pr_warn("map '%s': can't determine key size for type [%u]: %lld.\n", - map_name, t->type, sz); + pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found key [%u], sz = %lld.\n", - map_name, t->type, sz); + pr_debug("map '%s': found key [%u], sz = %zd.\n", + map_name, t->type, (ssize_t)sz); if (map->def.key_size && map->def.key_size != sz) { - pr_warn("map '%s': conflicting key size %u != %lld.\n", - map_name, map->def.key_size, sz); + pr_warn("map '%s': conflicting key size %u != %zd.\n", + map_name, map->def.key_size, (ssize_t)sz); return -EINVAL; } map->def.key_size = sz; @@ -1285,15 +1285,15 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, } sz = btf__resolve_size(obj->btf, t->type); if (sz < 0) { - pr_warn("map '%s': can't determine value size for type [%u]: %lld.\n", - map_name, t->type, sz); + pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found value [%u], sz = %lld.\n", - map_name, t->type, sz); + pr_debug("map '%s': found value [%u], sz = %zd.\n", + map_name, t->type, (ssize_t)sz); if (map->def.value_size && map->def.value_size != sz) { - pr_warn("map '%s': conflicting value size %u != %lld.\n", - map_name, map->def.value_size, sz); + pr_warn("map '%s': conflicting value size %u != %zd.\n", + map_name, map->def.value_size, (ssize_t)sz); return -EINVAL; } map->def.value_size = sz; @@ -1817,7 +1817,8 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__RELOC; } if (sym->st_value % 8) { - pr_warn("bad call relo offset: %llu\n", (__u64)sym->st_value); + pr_warn("bad call relo offset: %zu\n", + (size_t)sym->st_value); return -LIBBPF_ERRNO__RELOC; } reloc_desc->type = RELO_CALL; @@ -1859,8 +1860,8 @@ static int bpf_program__record_reloc(struct bpf_program *prog, break; } if (map_idx >= nr_maps) { - pr_warn("map relo failed to find map for sec %u, off %llu\n", - shdr_idx, (__u64)sym->st_value); + pr_warn("map relo failed to find map for sec %u, off %zu\n", + shdr_idx, (size_t)sym->st_value); return -LIBBPF_ERRNO__RELOC; } reloc_desc->type = RELO_LD64; @@ -1941,9 +1942,9 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, sym.st_name) ? : ""; - pr_debug("relo for shdr %u, symb %llu, value %llu, type %d, bind %d, name %d (\'%s\'), insn %u\n", - (__u32)sym.st_shndx, (__u64)GELF_R_SYM(rel.r_info), - (__u64)sym.st_value, GELF_ST_TYPE(sym.st_info), + pr_debug("relo for shdr %u, symb %zu, value %zu, type %d, bind %d, name %d (\'%s\'), insn %u\n", + (__u32)sym.st_shndx, (size_t)GELF_R_SYM(rel.r_info), + (size_t)sym.st_value, GELF_ST_TYPE(sym.st_info), GELF_ST_BIND(sym.st_info), sym.st_name, name, insn_idx); From 67d69ccdf389f3b7fabb00e2d81473a915b3b64d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 12 Dec 2019 11:22:50 +0100 Subject: [PATCH 007/127] libbpf: Recognize SK_REUSEPORT programs from section name Allow loading BPF object files that contain SK_REUSEPORT programs without having to manually set the program type before load if the the section name is set to "sk_reuseport". Makes user-space code needed to load SK_REUSEPORT BPF program more concise. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212102259.418536-2-jakub@cloudflare.com --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 920d4e06a5f9..b99c0a9b7756 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4983,6 +4983,7 @@ static const struct { enum bpf_attach_type attach_type; } section_names[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), + BPF_PROG_SEC("sk_reuseport", BPF_PROG_TYPE_SK_REUSEPORT), BPF_PROG_SEC("kprobe/", BPF_PROG_TYPE_KPROBE), BPF_PROG_SEC("uprobe/", BPF_PROG_TYPE_KPROBE), BPF_PROG_SEC("kretprobe/", BPF_PROG_TYPE_KPROBE), From 1fbcef929d7d71321745e61e8c20b0a02bd38cf1 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 12 Dec 2019 11:22:51 +0100 Subject: [PATCH 008/127] selftests/bpf: Let libbpf determine program type from section name Now that libbpf can recognize SK_REUSEPORT programs, we no longer have to pass a prog_type hint before loading the object file. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212102259.418536-3-jakub@cloudflare.com --- .../selftests/bpf/progs/test_select_reuseport_kern.c | 2 +- tools/testing/selftests/bpf/test_select_reuseport.c | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c index ea7d84f01235..b1f09f5bb1cf 100644 --- a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c +++ b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c @@ -62,7 +62,7 @@ struct { goto done; \ }) -SEC("select_by_skb_data") +SEC("sk_reuseport") int _select_by_skb_data(struct sk_reuseport_md *reuse_md) { __u32 linum, index = 0, flags = 0, index_zero = 0; diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c index 7566c13eb51a..1e3cfe1cb28a 100644 --- a/tools/testing/selftests/bpf/test_select_reuseport.c +++ b/tools/testing/selftests/bpf/test_select_reuseport.c @@ -87,19 +87,11 @@ static void prepare_bpf_obj(void) struct bpf_program *prog; struct bpf_map *map; int err; - struct bpf_object_open_attr attr = { - .file = "test_select_reuseport_kern.o", - .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, - }; - obj = bpf_object__open_xattr(&attr); + obj = bpf_object__open("test_select_reuseport_kern.o"); CHECK(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o", "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj)); - prog = bpf_program__next(NULL, obj); - CHECK(!prog, "get first bpf_program", "!prog\n"); - bpf_program__set_type(prog, attr.prog_type); - map = bpf_object__find_map_by_name(obj, "outer_map"); CHECK(!map, "find outer_map", "!map\n"); err = bpf_map__reuse_fd(map, outer_map); @@ -108,6 +100,8 @@ static void prepare_bpf_obj(void) err = bpf_object__load(obj); CHECK(err, "load bpf_object", "err:%d\n", err); + prog = bpf_program__next(NULL, obj); + CHECK(!prog, "get first bpf_program", "!prog\n"); select_by_skb_data_prog = bpf_program__fd(prog); CHECK(select_by_skb_data_prog == -1, "get prog fd", "select_by_skb_data_prog:%d\n", select_by_skb_data_prog); From 11f80355d4d27c7c798fc9890b4056c5d98af992 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 12 Dec 2019 11:22:52 +0100 Subject: [PATCH 009/127] selftests/bpf: Use sa_family_t everywhere in reuseport tests Update the only function that is not using sa_family_t in this source file. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212102259.418536-4-jakub@cloudflare.com --- tools/testing/selftests/bpf/test_select_reuseport.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c index 1e3cfe1cb28a..a295a087a026 100644 --- a/tools/testing/selftests/bpf/test_select_reuseport.c +++ b/tools/testing/selftests/bpf/test_select_reuseport.c @@ -643,7 +643,7 @@ static void prepare_sk_fds(int type, sa_family_t family, bool inany) } } -static void setup_per_test(int type, unsigned short family, bool inany) +static void setup_per_test(int type, sa_family_t family, bool inany) { int ovr = -1, err; @@ -680,12 +680,12 @@ static void test_all(void) const int types[] = { SOCK_STREAM, SOCK_DGRAM, SOCK_STREAM }; const char * const type_strings[] = { "TCP", "UDP", "TCP" }; const char * const family_strings[] = { "IPv6", "IPv4" }; - const unsigned short families[] = { AF_INET6, AF_INET }; + const sa_family_t families[] = { AF_INET6, AF_INET }; const bool bind_inany[] = { false, false, true }; int t, f, err; for (f = 0; f < ARRAY_SIZE(families); f++) { - unsigned short family = families[f]; + sa_family_t family = families[f]; for (t = 0; t < ARRAY_SIZE(types); t++) { bool inany = bind_inany[t]; From a9ce4cf4e47c55ab40641b92551bc9e1de4bfa61 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 12 Dec 2019 11:22:53 +0100 Subject: [PATCH 010/127] selftests/bpf: Add helpers for getting socket family & type name Having string arrays to map socket family & type to a name prevents us from unrolling the test runner loop in the subsequent patch. Introduce helpers that do the same thing. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212102259.418536-5-jakub@cloudflare.com --- .../selftests/bpf/test_select_reuseport.c | 28 +++++++++++++++++-- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c index a295a087a026..ef98a7de6704 100644 --- a/tools/testing/selftests/bpf/test_select_reuseport.c +++ b/tools/testing/selftests/bpf/test_select_reuseport.c @@ -674,12 +674,34 @@ static void cleanup(void) bpf_object__close(obj); } +static const char *family_str(sa_family_t family) +{ + switch (family) { + case AF_INET: + return "IPv4"; + case AF_INET6: + return "IPv6"; + default: + return "unknown"; + } +} + +static const char *sotype_str(int sotype) +{ + switch (sotype) { + case SOCK_STREAM: + return "TCP"; + case SOCK_DGRAM: + return "UDP"; + default: + return "unknown"; + } +} + static void test_all(void) { /* Extra SOCK_STREAM to test bind_inany==true */ const int types[] = { SOCK_STREAM, SOCK_DGRAM, SOCK_STREAM }; - const char * const type_strings[] = { "TCP", "UDP", "TCP" }; - const char * const family_strings[] = { "IPv6", "IPv4" }; const sa_family_t families[] = { AF_INET6, AF_INET }; const bool bind_inany[] = { false, false, true }; int t, f, err; @@ -692,7 +714,7 @@ static void test_all(void) int type = types[t]; printf("######## %s/%s %s ########\n", - family_strings[f], type_strings[t], + family_str(family), sotype_str(type), inany ? " INANY " : "LOOPBACK"); setup_per_test(type, family, inany); From 9936338258308aa008387dc4f9fd3f5fec6c046a Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 12 Dec 2019 11:22:54 +0100 Subject: [PATCH 011/127] selftests/bpf: Unroll the main loop in reuseport test Prepare for iterating over individual tests without introducing another nested loop in the main test function. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212102259.418536-6-jakub@cloudflare.com --- .../selftests/bpf/test_select_reuseport.c | 85 ++++++++++--------- 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c index ef98a7de6704..63ce2e75e758 100644 --- a/tools/testing/selftests/bpf/test_select_reuseport.c +++ b/tools/testing/selftests/bpf/test_select_reuseport.c @@ -698,47 +698,56 @@ static const char *sotype_str(int sotype) } } +static void test_config(int type, sa_family_t family, bool inany) +{ + int err; + + printf("######## %s/%s %s ########\n", + family_str(family), sotype_str(type), + inany ? " INANY " : "LOOPBACK"); + + setup_per_test(type, family, inany); + + test_err_inner_map(type, family); + + /* Install reuseport_array to the outer_map */ + err = bpf_map_update_elem(outer_map, &index_zero, + &reuseport_array, BPF_ANY); + CHECK(err == -1, "update_elem(outer_map)", + "err:%d errno:%d\n", err, errno); + + test_err_skb_data(type, family); + test_err_sk_select_port(type, family); + test_pass(type, family); + test_syncookie(type, family); + test_pass_on_err(type, family); + /* Must be the last test */ + test_detach_bpf(type, family); + + cleanup_per_test(); + printf("\n"); +} + +#define BIND_INANY true + static void test_all(void) { - /* Extra SOCK_STREAM to test bind_inany==true */ - const int types[] = { SOCK_STREAM, SOCK_DGRAM, SOCK_STREAM }; - const sa_family_t families[] = { AF_INET6, AF_INET }; - const bool bind_inany[] = { false, false, true }; - int t, f, err; + const struct config { + int sotype; + sa_family_t family; + bool inany; + } configs[] = { + { SOCK_STREAM, AF_INET }, + { SOCK_STREAM, AF_INET, BIND_INANY }, + { SOCK_STREAM, AF_INET6 }, + { SOCK_STREAM, AF_INET6, BIND_INANY }, + { SOCK_DGRAM, AF_INET }, + { SOCK_DGRAM, AF_INET6 }, + }; + const struct config *c; - for (f = 0; f < ARRAY_SIZE(families); f++) { - sa_family_t family = families[f]; - - for (t = 0; t < ARRAY_SIZE(types); t++) { - bool inany = bind_inany[t]; - int type = types[t]; - - printf("######## %s/%s %s ########\n", - family_str(family), sotype_str(type), - inany ? " INANY " : "LOOPBACK"); - - setup_per_test(type, family, inany); - - test_err_inner_map(type, family); - - /* Install reuseport_array to the outer_map */ - err = bpf_map_update_elem(outer_map, &index_zero, - &reuseport_array, BPF_ANY); - CHECK(err == -1, "update_elem(outer_map)", - "err:%d errno:%d\n", err, errno); - - test_err_skb_data(type, family); - test_err_sk_select_port(type, family); - test_pass(type, family); - test_syncookie(type, family); - test_pass_on_err(type, family); - /* Must be the last test */ - test_detach_bpf(type, family); - - cleanup_per_test(); - printf("\n"); - } - } + for (c = configs; c < configs + ARRAY_SIZE(configs); c++) + test_config(c->sotype, c->family, c->inany); } int main(int argc, const char **argv) From ce7cb5f3921cdf0f65ab877764265c6c0be34c3c Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 12 Dec 2019 11:22:55 +0100 Subject: [PATCH 012/127] selftests/bpf: Run reuseport tests in a loop Prepare for switching reuseport tests to test_progs framework. Loop over the tests and perform setup/cleanup for each test separately, remembering that with test_progs we can select tests to run. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212102259.418536-7-jakub@cloudflare.com --- .../selftests/bpf/test_select_reuseport.c | 55 ++++++++++++------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c index 63ce2e75e758..cfff958da570 100644 --- a/tools/testing/selftests/bpf/test_select_reuseport.c +++ b/tools/testing/selftests/bpf/test_select_reuseport.c @@ -643,7 +643,8 @@ static void prepare_sk_fds(int type, sa_family_t family, bool inany) } } -static void setup_per_test(int type, sa_family_t family, bool inany) +static void setup_per_test(int type, sa_family_t family, bool inany, + bool no_inner_map) { int ovr = -1, err; @@ -652,9 +653,18 @@ static void setup_per_test(int type, sa_family_t family, bool inany) BPF_ANY); CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)", "err:%d errno:%d\n", err, errno); + + /* Install reuseport_array to outer_map? */ + if (no_inner_map) + return; + + err = bpf_map_update_elem(outer_map, &index_zero, &reuseport_array, + BPF_ANY); + CHECK(err == -1, "update_elem(outer_map, 0, reuseport_array)", + "err:%d errno:%d\n", err, errno); } -static void cleanup_per_test(void) +static void cleanup_per_test(bool no_inner_map) { int i, err; @@ -662,6 +672,10 @@ static void cleanup_per_test(void) close(sk_fds[i]); close(epfd); + /* Delete reuseport_array from outer_map? */ + if (no_inner_map) + return; + err = bpf_map_delete_elem(outer_map, &index_zero); CHECK(err == -1, "delete_elem(outer_map)", "err:%d errno:%d\n", err, errno); @@ -700,31 +714,30 @@ static const char *sotype_str(int sotype) static void test_config(int type, sa_family_t family, bool inany) { - int err; + const struct test { + void (*fn)(int sotype, sa_family_t family); + bool no_inner_map; + } tests[] = { + { test_err_inner_map, true /* no_inner_map */ }, + { test_err_skb_data }, + { test_err_sk_select_port }, + { test_pass }, + { test_syncookie }, + { test_pass_on_err }, + { test_detach_bpf }, + }; + const struct test *t; printf("######## %s/%s %s ########\n", family_str(family), sotype_str(type), inany ? " INANY " : "LOOPBACK"); - setup_per_test(type, family, inany); + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + setup_per_test(type, family, inany, t->no_inner_map); + t->fn(type, family); + cleanup_per_test(t->no_inner_map); + } - test_err_inner_map(type, family); - - /* Install reuseport_array to the outer_map */ - err = bpf_map_update_elem(outer_map, &index_zero, - &reuseport_array, BPF_ANY); - CHECK(err == -1, "update_elem(outer_map)", - "err:%d errno:%d\n", err, errno); - - test_err_skb_data(type, family); - test_err_sk_select_port(type, family); - test_pass(type, family); - test_syncookie(type, family); - test_pass_on_err(type, family); - /* Must be the last test */ - test_detach_bpf(type, family); - - cleanup_per_test(); printf("\n"); } From 9af6c84435d0252e532c7f8bb70f1ed64a22ae2a Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 12 Dec 2019 11:22:56 +0100 Subject: [PATCH 013/127] selftests/bpf: Propagate errors during setup for reuseport tests Prepare for switching reuseport tests to test_progs framework, where we don't have the luxury to terminate the process on failure. Modify setup helpers to signal failure via the return value with the help of a macro similar to the one currently in use by the tests. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212102259.418536-8-jakub@cloudflare.com --- .../selftests/bpf/test_select_reuseport.c | 136 +++++++++++------- 1 file changed, 85 insertions(+), 51 deletions(-) diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c index cfff958da570..cc35816b7b34 100644 --- a/tools/testing/selftests/bpf/test_select_reuseport.c +++ b/tools/testing/selftests/bpf/test_select_reuseport.c @@ -32,7 +32,7 @@ static int result_map, tmp_index_ovr_map, linum_map, data_check_map; static enum result expected_results[NR_RESULTS]; static int sk_fds[REUSEPORT_ARRAY_SIZE]; -static int reuseport_array, outer_map; +static int reuseport_array = -1, outer_map = -1; static int select_by_skb_data_prog; static int saved_tcp_syncookie; static struct bpf_object *obj; @@ -55,7 +55,16 @@ static union sa46 { } \ }) -static void create_maps(void) +#define RET_ERR(condition, tag, format...) ({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ + printf(format); \ + return -1; \ + } \ +}) + +static int create_maps(void) { struct bpf_create_map_attr attr = {}; @@ -67,8 +76,8 @@ static void create_maps(void) attr.max_entries = REUSEPORT_ARRAY_SIZE; reuseport_array = bpf_create_map_xattr(&attr); - CHECK(reuseport_array == -1, "creating reuseport_array", - "reuseport_array:%d errno:%d\n", reuseport_array, errno); + RET_ERR(reuseport_array == -1, "creating reuseport_array", + "reuseport_array:%d errno:%d\n", reuseport_array, errno); /* Creating outer_map */ attr.name = "outer_map"; @@ -78,57 +87,61 @@ static void create_maps(void) attr.max_entries = 1; attr.inner_map_fd = reuseport_array; outer_map = bpf_create_map_xattr(&attr); - CHECK(outer_map == -1, "creating outer_map", - "outer_map:%d errno:%d\n", outer_map, errno); + RET_ERR(outer_map == -1, "creating outer_map", + "outer_map:%d errno:%d\n", outer_map, errno); + + return 0; } -static void prepare_bpf_obj(void) +static int prepare_bpf_obj(void) { struct bpf_program *prog; struct bpf_map *map; int err; obj = bpf_object__open("test_select_reuseport_kern.o"); - CHECK(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o", - "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj)); + RET_ERR(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o", + "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj)); map = bpf_object__find_map_by_name(obj, "outer_map"); - CHECK(!map, "find outer_map", "!map\n"); + RET_ERR(!map, "find outer_map", "!map\n"); err = bpf_map__reuse_fd(map, outer_map); - CHECK(err, "reuse outer_map", "err:%d\n", err); + RET_ERR(err, "reuse outer_map", "err:%d\n", err); err = bpf_object__load(obj); - CHECK(err, "load bpf_object", "err:%d\n", err); + RET_ERR(err, "load bpf_object", "err:%d\n", err); prog = bpf_program__next(NULL, obj); - CHECK(!prog, "get first bpf_program", "!prog\n"); + RET_ERR(!prog, "get first bpf_program", "!prog\n"); select_by_skb_data_prog = bpf_program__fd(prog); - CHECK(select_by_skb_data_prog == -1, "get prog fd", - "select_by_skb_data_prog:%d\n", select_by_skb_data_prog); + RET_ERR(select_by_skb_data_prog == -1, "get prog fd", + "select_by_skb_data_prog:%d\n", select_by_skb_data_prog); map = bpf_object__find_map_by_name(obj, "result_map"); - CHECK(!map, "find result_map", "!map\n"); + RET_ERR(!map, "find result_map", "!map\n"); result_map = bpf_map__fd(map); - CHECK(result_map == -1, "get result_map fd", - "result_map:%d\n", result_map); + RET_ERR(result_map == -1, "get result_map fd", + "result_map:%d\n", result_map); map = bpf_object__find_map_by_name(obj, "tmp_index_ovr_map"); - CHECK(!map, "find tmp_index_ovr_map", "!map\n"); + RET_ERR(!map, "find tmp_index_ovr_map\n", "!map"); tmp_index_ovr_map = bpf_map__fd(map); - CHECK(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd", - "tmp_index_ovr_map:%d\n", tmp_index_ovr_map); + RET_ERR(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd", + "tmp_index_ovr_map:%d\n", tmp_index_ovr_map); map = bpf_object__find_map_by_name(obj, "linum_map"); - CHECK(!map, "find linum_map", "!map\n"); + RET_ERR(!map, "find linum_map", "!map\n"); linum_map = bpf_map__fd(map); - CHECK(linum_map == -1, "get linum_map fd", - "linum_map:%d\n", linum_map); + RET_ERR(linum_map == -1, "get linum_map fd", + "linum_map:%d\n", linum_map); map = bpf_object__find_map_by_name(obj, "data_check_map"); - CHECK(!map, "find data_check_map", "!map\n"); + RET_ERR(!map, "find data_check_map", "!map\n"); data_check_map = bpf_map__fd(map); - CHECK(data_check_map == -1, "get data_check_map fd", - "data_check_map:%d\n", data_check_map); + RET_ERR(data_check_map == -1, "get data_check_map fd", + "data_check_map:%d\n", data_check_map); + + return 0; } static void sa46_init_loopback(union sa46 *sa, sa_family_t family) @@ -157,31 +170,34 @@ static int read_int_sysctl(const char *sysctl) int fd, ret; fd = open(sysctl, 0); - CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n", - sysctl, fd, errno); + RET_ERR(fd == -1, "open(sysctl)", + "sysctl:%s fd:%d errno:%d\n", sysctl, fd, errno); ret = read(fd, buf, sizeof(buf)); - CHECK(ret <= 0, "read(sysctl)", "sysctl:%s ret:%d errno:%d\n", - sysctl, ret, errno); - close(fd); + RET_ERR(ret <= 0, "read(sysctl)", + "sysctl:%s ret:%d errno:%d\n", sysctl, ret, errno); + close(fd); return atoi(buf); } -static void write_int_sysctl(const char *sysctl, int v) +static int write_int_sysctl(const char *sysctl, int v) { int fd, ret, size; char buf[16]; fd = open(sysctl, O_RDWR); - CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n", - sysctl, fd, errno); + RET_ERR(fd == -1, "open(sysctl)", + "sysctl:%s fd:%d errno:%d\n", sysctl, fd, errno); size = snprintf(buf, sizeof(buf), "%d", v); ret = write(fd, buf, size); - CHECK(ret != size, "write(sysctl)", - "sysctl:%s ret:%d size:%d errno:%d\n", sysctl, ret, size, errno); + RET_ERR(ret != size, "write(sysctl)", + "sysctl:%s ret:%d size:%d errno:%d\n", + sysctl, ret, size, errno); + close(fd); + return 0; } static void restore_sysctls(void) @@ -190,22 +206,25 @@ static void restore_sysctls(void) write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); } -static void enable_fastopen(void) +static int enable_fastopen(void) { int fo; fo = read_int_sysctl(TCP_FO_SYSCTL); - write_int_sysctl(TCP_FO_SYSCTL, fo | 7); + if (fo < 0) + return -1; + + return write_int_sysctl(TCP_FO_SYSCTL, fo | 7); } -static void enable_syncookie(void) +static int enable_syncookie(void) { - write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 2); + return write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 2); } -static void disable_syncookie(void) +static int disable_syncookie(void) { - write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 0); + return write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 0); } static __u32 get_linum(void) @@ -683,9 +702,12 @@ static void cleanup_per_test(bool no_inner_map) static void cleanup(void) { - close(outer_map); - close(reuseport_array); - bpf_object__close(obj); + if (outer_map != -1) + close(outer_map); + if (reuseport_array != -1) + close(reuseport_array); + if (obj) + bpf_object__close(obj); } static const char *family_str(sa_family_t family) @@ -765,16 +787,28 @@ static void test_all(void) int main(int argc, const char **argv) { - create_maps(); - prepare_bpf_obj(); + int ret = EXIT_FAILURE; + + if (create_maps()) + goto out; + if (prepare_bpf_obj()) + goto out; + saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); - enable_fastopen(); - disable_syncookie(); + if (saved_tcp_syncookie < 0 || saved_tcp_syncookie < 0) + goto out; atexit(restore_sysctls); + if (enable_fastopen()) + goto out; + if (disable_syncookie()) + goto out; + test_all(); + ret = EXIT_SUCCESS; +out: cleanup(); - return 0; + return ret; } From 250a91d48ae7616672a34b58388620a554a00180 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 12 Dec 2019 11:22:57 +0100 Subject: [PATCH 014/127] selftests/bpf: Pull up printing the test name into test runner Again, prepare for switching reuseport tests to test_progs framework. test_progs framework will print the subtest name for us if we set it. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212102259.418536-9-jakub@cloudflare.com --- .../selftests/bpf/test_select_reuseport.c | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c index cc35816b7b34..0d5687feb689 100644 --- a/tools/testing/selftests/bpf/test_select_reuseport.c +++ b/tools/testing/selftests/bpf/test_select_reuseport.c @@ -441,7 +441,6 @@ static void test_err_inner_map(int type, sa_family_t family) .pass_on_failure = 0, }; - printf("%s: ", __func__); expected_results[DROP_ERR_INNER_MAP]++; do_test(type, family, &cmd, DROP_ERR_INNER_MAP); printf("OK\n"); @@ -449,7 +448,6 @@ static void test_err_inner_map(int type, sa_family_t family) static void test_err_skb_data(int type, sa_family_t family) { - printf("%s: ", __func__); expected_results[DROP_ERR_SKB_DATA]++; do_test(type, family, NULL, DROP_ERR_SKB_DATA); printf("OK\n"); @@ -462,7 +460,6 @@ static void test_err_sk_select_port(int type, sa_family_t family) .pass_on_failure = 0, }; - printf("%s: ", __func__); expected_results[DROP_ERR_SK_SELECT_REUSEPORT]++; do_test(type, family, &cmd, DROP_ERR_SK_SELECT_REUSEPORT); printf("OK\n"); @@ -473,7 +470,6 @@ static void test_pass(int type, sa_family_t family) struct cmd cmd; int i; - printf("%s: ", __func__); cmd.pass_on_failure = 0; for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { expected_results[PASS]++; @@ -494,7 +490,6 @@ static void test_syncookie(int type, sa_family_t family) if (type != SOCK_STREAM) return; - printf("%s: ", __func__); /* * +1 for TCP-SYN and * +1 for the TCP-ACK (ack the syncookie) @@ -530,7 +525,6 @@ static void test_pass_on_err(int type, sa_family_t family) .pass_on_failure = 1, }; - printf("%s: ", __func__); expected_results[PASS_ERR_SK_SELECT_REUSEPORT] += 1; do_test(type, family, &cmd, PASS_ERR_SK_SELECT_REUSEPORT); printf("OK\n"); @@ -545,7 +539,6 @@ static void test_detach_bpf(int type, sa_family_t family) struct cmd cmd = {}; int optvalue = 0; - printf("%s: ", __func__); err = setsockopt(sk_fds[0], SOL_SOCKET, SO_DETACH_REUSEPORT_BPF, &optvalue, sizeof(optvalue)); CHECK(err == -1, "setsockopt(SO_DETACH_REUSEPORT_BPF)", @@ -584,7 +577,7 @@ static void test_detach_bpf(int type, sa_family_t family) printf("OK\n"); close(cli_fd); #else - printf("%s: SKIP\n", __func__); + printf("SKIP\n"); #endif } @@ -734,19 +727,22 @@ static const char *sotype_str(int sotype) } } +#define TEST_INIT(fn, ...) { fn, #fn, __VA_ARGS__ } + static void test_config(int type, sa_family_t family, bool inany) { const struct test { void (*fn)(int sotype, sa_family_t family); + const char *name; bool no_inner_map; } tests[] = { - { test_err_inner_map, true /* no_inner_map */ }, - { test_err_skb_data }, - { test_err_sk_select_port }, - { test_pass }, - { test_syncookie }, - { test_pass_on_err }, - { test_detach_bpf }, + TEST_INIT(test_err_inner_map, true /* no_inner_map */), + TEST_INIT(test_err_skb_data), + TEST_INIT(test_err_sk_select_port), + TEST_INIT(test_pass), + TEST_INIT(test_syncookie), + TEST_INIT(test_pass_on_err), + TEST_INIT(test_detach_bpf), }; const struct test *t; @@ -756,6 +752,7 @@ static void test_config(int type, sa_family_t family, bool inany) for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { setup_per_test(type, family, inany, t->no_inner_map); + printf("%s: ", t->name); t->fn(type, family); cleanup_per_test(t->no_inner_map); } From 415bb4e125b68a3aaccb8381baaa4a06dd1dcb27 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 12 Dec 2019 11:22:58 +0100 Subject: [PATCH 015/127] selftests/bpf: Move reuseport tests under prog_tests/ Do a pure move the show the actual work needed to adapt the tests in subsequent patch at the cost of breaking test_progs build for the moment. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212102259.418536-10-jakub@cloudflare.com --- tools/testing/selftests/bpf/Makefile | 2 +- .../{test_select_reuseport.c => prog_tests/select_reuseport.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tools/testing/selftests/bpf/{test_select_reuseport.c => prog_tests/select_reuseport.c} (100%) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e0fe01d9ec33..90de7d75b5c6 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -28,7 +28,7 @@ LDLIBS += -lcap -lelf -lrt -lpthread TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \ - test_cgroup_storage test_select_reuseport \ + test_cgroup_storage \ test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \ test_cgroup_attach test_progs-no_alu32 diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c similarity index 100% rename from tools/testing/selftests/bpf/test_select_reuseport.c rename to tools/testing/selftests/bpf/prog_tests/select_reuseport.c From 7ee0d4e97b889c0478af9c1a6e5af658b181423f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 12 Dec 2019 11:22:59 +0100 Subject: [PATCH 016/127] selftests/bpf: Switch reuseport tests for test_progs framework The tests were originally written in abort-on-error style. With the switch to test_progs we can no longer do that. So at the risk of not cleaning up some resource on failure, we now return to the caller on error. That said, failure inside one test should not affect others because we run setup/cleanup before/after every test. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212102259.418536-11-jakub@cloudflare.com --- .../bpf/prog_tests/select_reuseport.c | 267 +++++++++--------- 1 file changed, 131 insertions(+), 136 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 0d5687feb689..2c37ae7dc214 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -20,8 +20,11 @@ #include #include "bpf_rlimit.h" #include "bpf_util.h" + +#include "test_progs.h" #include "test_select_reuseport_common.h" +#define MAX_TEST_NAME 80 #define MIN_TCPHDR_LEN 20 #define UDPHDR_LEN 8 @@ -34,9 +37,9 @@ static enum result expected_results[NR_RESULTS]; static int sk_fds[REUSEPORT_ARRAY_SIZE]; static int reuseport_array = -1, outer_map = -1; static int select_by_skb_data_prog; -static int saved_tcp_syncookie; +static int saved_tcp_syncookie = -1; static struct bpf_object *obj; -static int saved_tcp_fo; +static int saved_tcp_fo = -1; static __u32 index_zero; static int epfd; @@ -46,20 +49,16 @@ static union sa46 { sa_family_t family; } srv_sa; -#define CHECK(condition, tag, format...) ({ \ - int __ret = !!(condition); \ - if (__ret) { \ - printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ - printf(format); \ - exit(-1); \ +#define RET_IF(condition, tag, format...) ({ \ + if (CHECK_FAIL(condition)) { \ + printf(tag " " format); \ + return; \ } \ }) #define RET_ERR(condition, tag, format...) ({ \ - int __ret = !!(condition); \ - if (__ret) { \ - printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ - printf(format); \ + if (CHECK_FAIL(condition)) { \ + printf(tag " " format); \ return -1; \ } \ }) @@ -202,8 +201,10 @@ static int write_int_sysctl(const char *sysctl, int v) static void restore_sysctls(void) { - write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); - write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); + if (saved_tcp_fo != -1) + write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); + if (saved_tcp_syncookie != -1) + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); } static int enable_fastopen(void) @@ -227,14 +228,14 @@ static int disable_syncookie(void) return write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 0); } -static __u32 get_linum(void) +static long get_linum(void) { __u32 linum; int err; err = bpf_map_lookup_elem(linum_map, &index_zero, &linum); - CHECK(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n", - err, errno); + RET_ERR(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n", + err, errno); return linum; } @@ -250,12 +251,12 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd, addrlen = sizeof(cli_sa); err = getsockname(cli_fd, (struct sockaddr *)&cli_sa, &addrlen); - CHECK(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n", - err, errno); + RET_IF(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n", + err, errno); err = bpf_map_lookup_elem(data_check_map, &index_zero, &result); - CHECK(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n", - err, errno); + RET_IF(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n", + err, errno); if (type == SOCK_STREAM) { expected.len = MIN_TCPHDR_LEN; @@ -297,22 +298,22 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd, printf("expected: (0x%x, %u, %u)\n", expected.eth_protocol, expected.ip_protocol, expected.bind_inany); - CHECK(1, "data_check result != expected", - "bpf_prog_linum:%u\n", get_linum()); + RET_IF(1, "data_check result != expected", + "bpf_prog_linum:%ld\n", get_linum()); } - CHECK(!result.hash, "data_check result.hash empty", - "result.hash:%u", result.hash); + RET_IF(!result.hash, "data_check result.hash empty", + "result.hash:%u", result.hash); expected.len += cmd ? sizeof(*cmd) : 0; if (type == SOCK_STREAM) - CHECK(expected.len > result.len, "expected.len > result.len", - "expected.len:%u result.len:%u bpf_prog_linum:%u\n", - expected.len, result.len, get_linum()); + RET_IF(expected.len > result.len, "expected.len > result.len", + "expected.len:%u result.len:%u bpf_prog_linum:%ld\n", + expected.len, result.len, get_linum()); else - CHECK(expected.len != result.len, "expected.len != result.len", - "expected.len:%u result.len:%u bpf_prog_linum:%u\n", - expected.len, result.len, get_linum()); + RET_IF(expected.len != result.len, "expected.len != result.len", + "expected.len:%u result.len:%u bpf_prog_linum:%ld\n", + expected.len, result.len, get_linum()); } static void check_results(void) @@ -323,8 +324,8 @@ static void check_results(void) for (i = 0; i < NR_RESULTS; i++) { err = bpf_map_lookup_elem(result_map, &i, &results[i]); - CHECK(err == -1, "lookup_elem(result_map)", - "i:%u err:%d errno:%d\n", i, err, errno); + RET_IF(err == -1, "lookup_elem(result_map)", + "i:%u err:%d errno:%d\n", i, err, errno); } for (i = 0; i < NR_RESULTS; i++) { @@ -350,10 +351,10 @@ static void check_results(void) printf(", %u", expected_results[i]); printf("]\n"); - CHECK(expected_results[broken] != results[broken], - "unexpected result", - "expected_results[%u] != results[%u] bpf_prog_linum:%u\n", - broken, broken, get_linum()); + RET_IF(expected_results[broken] != results[broken], + "unexpected result", + "expected_results[%u] != results[%u] bpf_prog_linum:%ld\n", + broken, broken, get_linum()); } static int send_data(int type, sa_family_t family, void *data, size_t len, @@ -363,17 +364,17 @@ static int send_data(int type, sa_family_t family, void *data, size_t len, int fd, err; fd = socket(family, type, 0); - CHECK(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno); + RET_ERR(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno); sa46_init_loopback(&cli_sa, family); err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa)); - CHECK(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno); + RET_ERR(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno); err = sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa, sizeof(srv_sa)); - CHECK(err != len && expected >= PASS, - "sendto()", "family:%u err:%d errno:%d expected:%d\n", - family, err, errno, expected); + RET_ERR(err != len && expected >= PASS, + "sendto()", "family:%u err:%d errno:%d expected:%d\n", + family, err, errno, expected); return fd; } @@ -388,47 +389,49 @@ static void do_test(int type, sa_family_t family, struct cmd *cmd, cli_fd = send_data(type, family, cmd, cmd ? sizeof(*cmd) : 0, expected); + if (cli_fd < 0) + return; nev = epoll_wait(epfd, &ev, 1, expected >= PASS ? 5 : 0); - CHECK((nev <= 0 && expected >= PASS) || - (nev > 0 && expected < PASS), - "nev <> expected", - "nev:%d expected:%d type:%d family:%d data:(%d, %d)\n", - nev, expected, type, family, - cmd ? cmd->reuseport_index : -1, - cmd ? cmd->pass_on_failure : -1); + RET_IF((nev <= 0 && expected >= PASS) || + (nev > 0 && expected < PASS), + "nev <> expected", + "nev:%d expected:%d type:%d family:%d data:(%d, %d)\n", + nev, expected, type, family, + cmd ? cmd->reuseport_index : -1, + cmd ? cmd->pass_on_failure : -1); check_results(); check_data(type, family, cmd, cli_fd); if (expected < PASS) return; - CHECK(expected != PASS_ERR_SK_SELECT_REUSEPORT && - cmd->reuseport_index != ev.data.u32, - "check cmd->reuseport_index", - "cmd:(%u, %u) ev.data.u32:%u\n", - cmd->pass_on_failure, cmd->reuseport_index, ev.data.u32); + RET_IF(expected != PASS_ERR_SK_SELECT_REUSEPORT && + cmd->reuseport_index != ev.data.u32, + "check cmd->reuseport_index", + "cmd:(%u, %u) ev.data.u32:%u\n", + cmd->pass_on_failure, cmd->reuseport_index, ev.data.u32); srv_fd = sk_fds[ev.data.u32]; if (type == SOCK_STREAM) { int new_fd = accept(srv_fd, NULL, 0); - CHECK(new_fd == -1, "accept(srv_fd)", - "ev.data.u32:%u new_fd:%d errno:%d\n", - ev.data.u32, new_fd, errno); + RET_IF(new_fd == -1, "accept(srv_fd)", + "ev.data.u32:%u new_fd:%d errno:%d\n", + ev.data.u32, new_fd, errno); nread = recv(new_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); - CHECK(nread != sizeof(rcv_cmd), - "recv(new_fd)", - "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", - ev.data.u32, nread, sizeof(rcv_cmd), errno); + RET_IF(nread != sizeof(rcv_cmd), + "recv(new_fd)", + "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", + ev.data.u32, nread, sizeof(rcv_cmd), errno); close(new_fd); } else { nread = recv(srv_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); - CHECK(nread != sizeof(rcv_cmd), - "recv(sk_fds)", - "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", - ev.data.u32, nread, sizeof(rcv_cmd), errno); + RET_IF(nread != sizeof(rcv_cmd), + "recv(sk_fds)", + "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", + ev.data.u32, nread, sizeof(rcv_cmd), errno); } close(cli_fd); @@ -443,14 +446,12 @@ static void test_err_inner_map(int type, sa_family_t family) expected_results[DROP_ERR_INNER_MAP]++; do_test(type, family, &cmd, DROP_ERR_INNER_MAP); - printf("OK\n"); } static void test_err_skb_data(int type, sa_family_t family) { expected_results[DROP_ERR_SKB_DATA]++; do_test(type, family, NULL, DROP_ERR_SKB_DATA); - printf("OK\n"); } static void test_err_sk_select_port(int type, sa_family_t family) @@ -462,7 +463,6 @@ static void test_err_sk_select_port(int type, sa_family_t family) expected_results[DROP_ERR_SK_SELECT_REUSEPORT]++; do_test(type, family, &cmd, DROP_ERR_SK_SELECT_REUSEPORT); - printf("OK\n"); } static void test_pass(int type, sa_family_t family) @@ -476,7 +476,6 @@ static void test_pass(int type, sa_family_t family) cmd.reuseport_index = i; do_test(type, family, &cmd, PASS); } - printf("OK\n"); } static void test_syncookie(int type, sa_family_t family) @@ -505,17 +504,16 @@ static void test_syncookie(int type, sa_family_t family) */ err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &tmp_index, BPF_ANY); - CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)", + "err:%d errno:%d\n", err, errno); do_test(type, family, &cmd, PASS); err = bpf_map_lookup_elem(tmp_index_ovr_map, &index_zero, &tmp_index); - CHECK(err == -1 || tmp_index != -1, - "lookup_elem(tmp_index_ovr_map)", - "err:%d errno:%d tmp_index:%d\n", - err, errno, tmp_index); + RET_IF(err == -1 || tmp_index != -1, + "lookup_elem(tmp_index_ovr_map)", + "err:%d errno:%d tmp_index:%d\n", + err, errno, tmp_index); disable_syncookie(); - printf("OK\n"); } static void test_pass_on_err(int type, sa_family_t family) @@ -527,7 +525,6 @@ static void test_pass_on_err(int type, sa_family_t family) expected_results[PASS_ERR_SK_SELECT_REUSEPORT] += 1; do_test(type, family, &cmd, PASS_ERR_SK_SELECT_REUSEPORT); - printf("OK\n"); } static void test_detach_bpf(int type, sa_family_t family) @@ -541,43 +538,45 @@ static void test_detach_bpf(int type, sa_family_t family) err = setsockopt(sk_fds[0], SOL_SOCKET, SO_DETACH_REUSEPORT_BPF, &optvalue, sizeof(optvalue)); - CHECK(err == -1, "setsockopt(SO_DETACH_REUSEPORT_BPF)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "setsockopt(SO_DETACH_REUSEPORT_BPF)", + "err:%d errno:%d\n", err, errno); err = setsockopt(sk_fds[1], SOL_SOCKET, SO_DETACH_REUSEPORT_BPF, &optvalue, sizeof(optvalue)); - CHECK(err == 0 || errno != ENOENT, "setsockopt(SO_DETACH_REUSEPORT_BPF)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == 0 || errno != ENOENT, + "setsockopt(SO_DETACH_REUSEPORT_BPF)", + "err:%d errno:%d\n", err, errno); for (i = 0; i < NR_RESULTS; i++) { err = bpf_map_lookup_elem(result_map, &i, &tmp); - CHECK(err == -1, "lookup_elem(result_map)", - "i:%u err:%d errno:%d\n", i, err, errno); + RET_IF(err == -1, "lookup_elem(result_map)", + "i:%u err:%d errno:%d\n", i, err, errno); nr_run_before += tmp; } cli_fd = send_data(type, family, &cmd, sizeof(cmd), PASS); + if (cli_fd < 0) + return; nev = epoll_wait(epfd, &ev, 1, 5); - CHECK(nev <= 0, "nev <= 0", - "nev:%d expected:1 type:%d family:%d data:(0, 0)\n", - nev, type, family); + RET_IF(nev <= 0, "nev <= 0", + "nev:%d expected:1 type:%d family:%d data:(0, 0)\n", + nev, type, family); for (i = 0; i < NR_RESULTS; i++) { err = bpf_map_lookup_elem(result_map, &i, &tmp); - CHECK(err == -1, "lookup_elem(result_map)", - "i:%u err:%d errno:%d\n", i, err, errno); + RET_IF(err == -1, "lookup_elem(result_map)", + "i:%u err:%d errno:%d\n", i, err, errno); nr_run_after += tmp; } - CHECK(nr_run_before != nr_run_after, - "nr_run_before != nr_run_after", - "nr_run_before:%u nr_run_after:%u\n", - nr_run_before, nr_run_after); + RET_IF(nr_run_before != nr_run_after, + "nr_run_before != nr_run_after", + "nr_run_before:%u nr_run_after:%u\n", + nr_run_before, nr_run_after); - printf("OK\n"); close(cli_fd); #else - printf("SKIP\n"); + test__skip(); #endif } @@ -600,58 +599,58 @@ static void prepare_sk_fds(int type, sa_family_t family, bool inany) */ for (i = first; i >= 0; i--) { sk_fds[i] = socket(family, type, 0); - CHECK(sk_fds[i] == -1, "socket()", "sk_fds[%d]:%d errno:%d\n", - i, sk_fds[i], errno); + RET_IF(sk_fds[i] == -1, "socket()", "sk_fds[%d]:%d errno:%d\n", + i, sk_fds[i], errno); err = setsockopt(sk_fds[i], SOL_SOCKET, SO_REUSEPORT, &optval, sizeof(optval)); - CHECK(err == -1, "setsockopt(SO_REUSEPORT)", - "sk_fds[%d] err:%d errno:%d\n", - i, err, errno); + RET_IF(err == -1, "setsockopt(SO_REUSEPORT)", + "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); if (i == first) { err = setsockopt(sk_fds[i], SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &select_by_skb_data_prog, sizeof(select_by_skb_data_prog)); - CHECK(err == -1, "setsockopt(SO_ATTACH_REUEPORT_EBPF)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "setsockopt(SO_ATTACH_REUEPORT_EBPF)", + "err:%d errno:%d\n", err, errno); } err = bind(sk_fds[i], (struct sockaddr *)&srv_sa, addrlen); - CHECK(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n", - i, err, errno); + RET_IF(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); if (type == SOCK_STREAM) { err = listen(sk_fds[i], 10); - CHECK(err == -1, "listen()", - "sk_fds[%d] err:%d errno:%d\n", - i, err, errno); + RET_IF(err == -1, "listen()", + "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); } err = bpf_map_update_elem(reuseport_array, &i, &sk_fds[i], BPF_NOEXIST); - CHECK(err == -1, "update_elem(reuseport_array)", - "sk_fds[%d] err:%d errno:%d\n", i, err, errno); + RET_IF(err == -1, "update_elem(reuseport_array)", + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); if (i == first) { socklen_t addrlen = sizeof(srv_sa); err = getsockname(sk_fds[i], (struct sockaddr *)&srv_sa, &addrlen); - CHECK(err == -1, "getsockname()", - "sk_fds[%d] err:%d errno:%d\n", i, err, errno); + RET_IF(err == -1, "getsockname()", + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); } } epfd = epoll_create(1); - CHECK(epfd == -1, "epoll_create(1)", - "epfd:%d errno:%d\n", epfd, errno); + RET_IF(epfd == -1, "epoll_create(1)", + "epfd:%d errno:%d\n", epfd, errno); ev.events = EPOLLIN; for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { ev.data.u32 = i; err = epoll_ctl(epfd, EPOLL_CTL_ADD, sk_fds[i], &ev); - CHECK(err, "epoll_ctl(EPOLL_CTL_ADD)", "sk_fds[%d]\n", i); + RET_IF(err, "epoll_ctl(EPOLL_CTL_ADD)", "sk_fds[%d]\n", i); } } @@ -663,8 +662,8 @@ static void setup_per_test(int type, sa_family_t family, bool inany, prepare_sk_fds(type, family, inany); err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &ovr, BPF_ANY); - CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)", + "err:%d errno:%d\n", err, errno); /* Install reuseport_array to outer_map? */ if (no_inner_map) @@ -672,8 +671,8 @@ static void setup_per_test(int type, sa_family_t family, bool inany, err = bpf_map_update_elem(outer_map, &index_zero, &reuseport_array, BPF_ANY); - CHECK(err == -1, "update_elem(outer_map, 0, reuseport_array)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "update_elem(outer_map, 0, reuseport_array)", + "err:%d errno:%d\n", err, errno); } static void cleanup_per_test(bool no_inner_map) @@ -689,8 +688,8 @@ static void cleanup_per_test(bool no_inner_map) return; err = bpf_map_delete_elem(outer_map, &index_zero); - CHECK(err == -1, "delete_elem(outer_map)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "delete_elem(outer_map)", + "err:%d errno:%d\n", err, errno); } static void cleanup(void) @@ -729,7 +728,7 @@ static const char *sotype_str(int sotype) #define TEST_INIT(fn, ...) { fn, #fn, __VA_ARGS__ } -static void test_config(int type, sa_family_t family, bool inany) +static void test_config(int sotype, sa_family_t family, bool inany) { const struct test { void (*fn)(int sotype, sa_family_t family); @@ -744,20 +743,21 @@ static void test_config(int type, sa_family_t family, bool inany) TEST_INIT(test_pass_on_err), TEST_INIT(test_detach_bpf), }; + char s[MAX_TEST_NAME]; const struct test *t; - printf("######## %s/%s %s ########\n", - family_str(family), sotype_str(type), - inany ? " INANY " : "LOOPBACK"); - for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { - setup_per_test(type, family, inany, t->no_inner_map); - printf("%s: ", t->name); - t->fn(type, family); + snprintf(s, sizeof(s), "%s/%s %s %s", + family_str(family), sotype_str(sotype), + inany ? "INANY" : "LOOPBACK", t->name); + + if (!test__start_subtest(s)) + continue; + + setup_per_test(sotype, family, inany, t->no_inner_map); + t->fn(sotype, family); cleanup_per_test(t->no_inner_map); } - - printf("\n"); } #define BIND_INANY true @@ -782,10 +782,8 @@ static void test_all(void) test_config(c->sotype, c->family, c->inany); } -int main(int argc, const char **argv) +void test_select_reuseport(void) { - int ret = EXIT_FAILURE; - if (create_maps()) goto out; if (prepare_bpf_obj()) @@ -795,7 +793,6 @@ int main(int argc, const char **argv) saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); if (saved_tcp_syncookie < 0 || saved_tcp_syncookie < 0) goto out; - atexit(restore_sysctls); if (enable_fastopen()) goto out; @@ -803,9 +800,7 @@ int main(int argc, const char **argv) goto out; test_all(); - - ret = EXIT_SUCCESS; out: cleanup(); - return ret; + restore_sysctls(); } From 6803ee25f0ead1e836808acb14396bb9a9849113 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 11 Dec 2019 17:35:48 -0800 Subject: [PATCH 017/127] libbpf: Extract and generalize CPU mask parsing logic This logic is re-used for parsing a set of online CPUs. Having it as an isolated piece of code working with input string makes it conveninent to test this logic as well. While refactoring, also improve the robustness of original implementation. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212013548.1690564-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 131 +++++++++++++++++++++----------- tools/lib/bpf/libbpf_internal.h | 2 + 2 files changed, 89 insertions(+), 44 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b99c0a9b7756..98455e8aa6db 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6523,61 +6523,104 @@ void bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear) } } +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz) +{ + int err = 0, n, len, start, end = -1; + bool *tmp; + + *mask = NULL; + *mask_sz = 0; + + /* Each sub string separated by ',' has format \d+-\d+ or \d+ */ + while (*s) { + if (*s == ',' || *s == '\n') { + s++; + continue; + } + n = sscanf(s, "%d%n-%d%n", &start, &len, &end, &len); + if (n <= 0 || n > 2) { + pr_warn("Failed to get CPU range %s: %d\n", s, n); + err = -EINVAL; + goto cleanup; + } else if (n == 1) { + end = start; + } + if (start < 0 || start > end) { + pr_warn("Invalid CPU range [%d,%d] in %s\n", + start, end, s); + err = -EINVAL; + goto cleanup; + } + tmp = realloc(*mask, end + 1); + if (!tmp) { + err = -ENOMEM; + goto cleanup; + } + *mask = tmp; + memset(tmp + *mask_sz, 0, start - *mask_sz); + memset(tmp + start, 1, end - start + 1); + *mask_sz = end + 1; + s += len; + } + if (!*mask_sz) { + pr_warn("Empty CPU range\n"); + return -EINVAL; + } + return 0; +cleanup: + free(*mask); + *mask = NULL; + return err; +} + +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz) +{ + int fd, err = 0, len; + char buf[128]; + + fd = open(fcpu, O_RDONLY); + if (fd < 0) { + err = -errno; + pr_warn("Failed to open cpu mask file %s: %d\n", fcpu, err); + return err; + } + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len <= 0) { + err = len ? -errno : -EINVAL; + pr_warn("Failed to read cpu mask from %s: %d\n", fcpu, err); + return err; + } + if (len >= sizeof(buf)) { + pr_warn("CPU mask is too big in file %s\n", fcpu); + return -E2BIG; + } + buf[len] = '\0'; + + return parse_cpu_mask_str(buf, mask, mask_sz); +} + int libbpf_num_possible_cpus(void) { static const char *fcpu = "/sys/devices/system/cpu/possible"; - int len = 0, n = 0, il = 0, ir = 0; - unsigned int start = 0, end = 0; - int tmp_cpus = 0; static int cpus; - char buf[128]; - int error = 0; - int fd = -1; + int err, n, i, tmp_cpus; + bool *mask; tmp_cpus = READ_ONCE(cpus); if (tmp_cpus > 0) return tmp_cpus; - fd = open(fcpu, O_RDONLY); - if (fd < 0) { - error = errno; - pr_warn("Failed to open file %s: %s\n", fcpu, strerror(error)); - return -error; - } - len = read(fd, buf, sizeof(buf)); - close(fd); - if (len <= 0) { - error = len ? errno : EINVAL; - pr_warn("Failed to read # of possible cpus from %s: %s\n", - fcpu, strerror(error)); - return -error; - } - if (len == sizeof(buf)) { - pr_warn("File %s size overflow\n", fcpu); - return -EOVERFLOW; - } - buf[len] = '\0'; + err = parse_cpu_mask_file(fcpu, &mask, &n); + if (err) + return err; - for (ir = 0, tmp_cpus = 0; ir <= len; ir++) { - /* Each sub string separated by ',' has format \d+-\d+ or \d+ */ - if (buf[ir] == ',' || buf[ir] == '\0') { - buf[ir] = '\0'; - n = sscanf(&buf[il], "%u-%u", &start, &end); - if (n <= 0) { - pr_warn("Failed to get # CPUs from %s\n", - &buf[il]); - return -EINVAL; - } else if (n == 1) { - end = start; - } - tmp_cpus += end - start + 1; - il = ir + 1; - } - } - if (tmp_cpus <= 0) { - pr_warn("Invalid #CPUs %d from %s\n", tmp_cpus, fcpu); - return -EINVAL; + tmp_cpus = 0; + for (i = 0; i < n; i++) { + if (mask[i]) + tmp_cpus++; } + free(mask); WRITE_ONCE(cpus, tmp_cpus); return tmp_cpus; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 97ac17a64a58..f4f10715db40 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -95,6 +95,8 @@ static inline bool libbpf_validate_opts(const char *opts, #define OPTS_GET(opts, field, fallback_value) \ (OPTS_HAS(opts, field) ? (opts)->field : fallback_value) +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz); +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, const char *str_sec, size_t str_len); From 65bc4c4063ae36223375a920b7d116e54d7def34 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 11 Dec 2019 17:35:58 -0800 Subject: [PATCH 018/127] selftests/bpf: Add CPU mask parsing tests Add a bunch of test validating CPU mask parsing logic and error handling. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212013559.1690898-1-andriin@fb.com --- .../selftests/bpf/prog_tests/cpu_mask.c | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/cpu_mask.c diff --git a/tools/testing/selftests/bpf/prog_tests/cpu_mask.c b/tools/testing/selftests/bpf/prog_tests/cpu_mask.c new file mode 100644 index 000000000000..1fa1bdbaffa9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cpu_mask.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "libbpf_internal.h" + +static int duration = 0; + +static void validate_mask(int case_nr, const char *exp, bool *mask, int n) +{ + int i; + + for (i = 0; exp[i]; i++) { + if (exp[i] == '1') { + if (CHECK(i + 1 > n, "mask_short", + "case #%d: mask too short, got n=%d, need at least %d\n", + case_nr, n, i + 1)) + return; + CHECK(!mask[i], "cpu_not_set", + "case #%d: mask differs, expected cpu#%d SET\n", + case_nr, i); + } else { + CHECK(i < n && mask[i], "cpu_set", + "case #%d: mask differs, expected cpu#%d UNSET\n", + case_nr, i); + } + } + CHECK(i < n, "mask_long", + "case #%d: mask too long, got n=%d, expected at most %d\n", + case_nr, n, i); +} + +static struct { + const char *cpu_mask; + const char *expect; + bool fails; +} test_cases[] = { + { "0\n", "1", false }, + { "0,2\n", "101", false }, + { "0-2\n", "111", false }, + { "0-2,3-4\n", "11111", false }, + { "0", "1", false }, + { "0-2", "111", false }, + { "0,2", "101", false }, + { "0,1-3", "1111", false }, + { "0,1,2,3", "1111", false }, + { "0,2-3,5", "101101", false }, + { "3-3", "0001", false }, + { "2-4,6,9-10", "00111010011", false }, + /* failure cases */ + { "", "", true }, + { "0-", "", true }, + { "0 ", "", true }, + { "0_1", "", true }, + { "1-0", "", true }, + { "-1", "", true }, +}; + +void test_cpu_mask() +{ + int i, err, n; + bool *mask; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + mask = NULL; + err = parse_cpu_mask_str(test_cases[i].cpu_mask, &mask, &n); + if (test_cases[i].fails) { + CHECK(!err, "should_fail", + "case #%d: parsing should fail!\n", i + 1); + } else { + if (CHECK(err, "parse_err", + "case #%d: cpu mask parsing failed: %d\n", + i + 1, err)) + continue; + validate_mask(i + 1, test_cases[i].expect, mask, n); + } + free(mask); + } +} From 783b8f01f5942a786998f5577bd9ff3992f22a1a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 11 Dec 2019 17:36:09 -0800 Subject: [PATCH 019/127] libbpf: Don't attach perf_buffer to offline/missing CPUs It's quite common on some systems to have more CPUs enlisted as "possible", than there are (and could ever be) present/online CPUs. In such cases, perf_buffer creationg will fail due to inability to create perf event on missing CPU with error like this: libbpf: failed to open perf buffer event on cpu #16: No such device This patch fixes the logic of perf_buffer__new() to ignore CPUs that are missing or currently offline. In rare cases where user explicitly listed specific CPUs to connect to, behavior is unchanged: libbpf will try to open perf event buffer on specified CPU(s) anyways. Fixes: fb84b8224655 ("libbpf: add perf buffer API") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212013609.1691168-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 98455e8aa6db..27d5f7ecba32 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5946,7 +5946,7 @@ struct perf_buffer { size_t mmap_size; struct perf_cpu_buf **cpu_bufs; struct epoll_event *events; - int cpu_cnt; + int cpu_cnt; /* number of allocated CPU buffers */ int epoll_fd; /* perf event FD */ int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ }; @@ -6080,11 +6080,13 @@ perf_buffer__new_raw(int map_fd, size_t page_cnt, static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, struct perf_buffer_params *p) { + const char *online_cpus_file = "/sys/devices/system/cpu/online"; struct bpf_map_info map = {}; char msg[STRERR_BUFSIZE]; struct perf_buffer *pb; + bool *online = NULL; __u32 map_info_len; - int err, i; + int err, i, j, n; if (page_cnt & (page_cnt - 1)) { pr_warn("page count should be power of two, but is %zu\n", @@ -6153,20 +6155,32 @@ static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, goto error; } - for (i = 0; i < pb->cpu_cnt; i++) { + err = parse_cpu_mask_file(online_cpus_file, &online, &n); + if (err) { + pr_warn("failed to get online CPU mask: %d\n", err); + goto error; + } + + for (i = 0, j = 0; i < pb->cpu_cnt; i++) { struct perf_cpu_buf *cpu_buf; int cpu, map_key; cpu = p->cpu_cnt > 0 ? p->cpus[i] : i; map_key = p->cpu_cnt > 0 ? p->map_keys[i] : i; + /* in case user didn't explicitly requested particular CPUs to + * be attached to, skip offline/not present CPUs + */ + if (p->cpu_cnt <= 0 && (cpu >= n || !online[cpu])) + continue; + cpu_buf = perf_buffer__open_cpu_buf(pb, p->attr, cpu, map_key); if (IS_ERR(cpu_buf)) { err = PTR_ERR(cpu_buf); goto error; } - pb->cpu_bufs[i] = cpu_buf; + pb->cpu_bufs[j] = cpu_buf; err = bpf_map_update_elem(pb->map_fd, &map_key, &cpu_buf->fd, 0); @@ -6178,21 +6192,25 @@ static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, goto error; } - pb->events[i].events = EPOLLIN; - pb->events[i].data.ptr = cpu_buf; + pb->events[j].events = EPOLLIN; + pb->events[j].data.ptr = cpu_buf; if (epoll_ctl(pb->epoll_fd, EPOLL_CTL_ADD, cpu_buf->fd, - &pb->events[i]) < 0) { + &pb->events[j]) < 0) { err = -errno; pr_warn("failed to epoll_ctl cpu #%d perf FD %d: %s\n", cpu, cpu_buf->fd, libbpf_strerror_r(err, msg, sizeof(msg))); goto error; } + j++; } + pb->cpu_cnt = j; + free(online); return pb; error: + free(online); if (pb) perf_buffer__free(pb); return ERR_PTR(err); From 91cbdf740a476cf2c744169bf407de2e3ac1f3cf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 11 Dec 2019 17:36:20 -0800 Subject: [PATCH 020/127] selftests/bpf: Fix perf_buffer test on systems w/ offline CPUs Fix up perf_buffer.c selftest to take into account offline/missing CPUs. Fixes: ee5cf82ce04a ("selftests/bpf: test perf buffer API") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191212013621.1691858-1-andriin@fb.com --- .../selftests/bpf/prog_tests/perf_buffer.c | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c index 3003fddc0613..cf6c87936c69 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -4,6 +4,7 @@ #include #include #include +#include "libbpf_internal.h" static void on_sample(void *ctx, int cpu, void *data, __u32 size) { @@ -19,7 +20,7 @@ static void on_sample(void *ctx, int cpu, void *data, __u32 size) void test_perf_buffer(void) { - int err, prog_fd, nr_cpus, i, duration = 0; + int err, prog_fd, on_len, nr_on_cpus = 0, nr_cpus, i, duration = 0; const char *prog_name = "kprobe/sys_nanosleep"; const char *file = "./test_perf_buffer.o"; struct perf_buffer_opts pb_opts = {}; @@ -29,15 +30,27 @@ void test_perf_buffer(void) struct bpf_object *obj; struct perf_buffer *pb; struct bpf_link *link; + bool *online; nr_cpus = libbpf_num_possible_cpus(); if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus)) return; + err = parse_cpu_mask_file("/sys/devices/system/cpu/online", + &online, &on_len); + if (CHECK(err, "nr_on_cpus", "err %d\n", err)) + return; + + for (i = 0; i < on_len; i++) + if (online[i]) + nr_on_cpus++; + /* load program */ err = bpf_prog_load(file, BPF_PROG_TYPE_KPROBE, &obj, &prog_fd); - if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno)) - return; + if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno)) { + obj = NULL; + goto out_close; + } prog = bpf_object__find_program_by_title(obj, prog_name); if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name)) @@ -64,6 +77,11 @@ void test_perf_buffer(void) /* trigger kprobe on every CPU */ CPU_ZERO(&cpu_seen); for (i = 0; i < nr_cpus; i++) { + if (i >= on_len || !online[i]) { + printf("skipping offline CPU #%d\n", i); + continue; + } + CPU_ZERO(&cpu_set); CPU_SET(i, &cpu_set); @@ -81,8 +99,8 @@ void test_perf_buffer(void) if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err)) goto out_free_pb; - if (CHECK(CPU_COUNT(&cpu_seen) != nr_cpus, "seen_cpu_cnt", - "expect %d, seen %d\n", nr_cpus, CPU_COUNT(&cpu_seen))) + if (CHECK(CPU_COUNT(&cpu_seen) != nr_on_cpus, "seen_cpu_cnt", + "expect %d, seen %d\n", nr_on_cpus, CPU_COUNT(&cpu_seen))) goto out_free_pb; out_free_pb: @@ -91,4 +109,5 @@ out_detach: bpf_link__destroy(link); out_close: bpf_object__close(obj); + free(online); } From 98e8627efcada18ac043a77b9101b4b4c768090b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 13 Dec 2019 18:51:07 +0100 Subject: [PATCH 021/127] bpf: Move trampoline JIT image allocation to a function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the image allocation in the BPF trampoline code into a separate function, so it can be shared with the BPF dispatcher in upcoming commits. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-2-bjorn.topel@gmail.com --- include/linux/bpf.h | 1 + kernel/bpf/trampoline.c | 24 +++++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 35903f148be5..5d744828b399 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -475,6 +475,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key); int bpf_trampoline_link_prog(struct bpf_prog *prog); int bpf_trampoline_unlink_prog(struct bpf_prog *prog); void bpf_trampoline_put(struct bpf_trampoline *tr); +void *bpf_jit_alloc_exec_page(void); #else static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7e89f1f49d77..5ee301ddbd00 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -13,6 +13,22 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +void *bpf_jit_alloc_exec_page(void) +{ + void *image; + + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) + return NULL; + + set_vm_flush_reset_perms(image); + /* Keep image as writeable. The alternative is to keep flipping ro/rw + * everytime new program is attached or detached. + */ + set_memory_x((long)image, 1); + return image; +} + struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; @@ -33,7 +49,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec(PAGE_SIZE); + image = bpf_jit_alloc_exec_page(); if (!image) { kfree(tr); tr = NULL; @@ -47,12 +63,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); - - set_vm_flush_reset_perms(image); - /* Keep image as writeable. The alternative is to keep flipping ro/rw - * everytime new program is attached or detached. - */ - set_memory_x((long)image, 1); tr->image = image; out: mutex_unlock(&trampoline_mutex); From 75ccbef6369e94ecac696a152a998a978d41376b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 13 Dec 2019 18:51:08 +0100 Subject: [PATCH 022/127] bpf: Introduce BPF dispatcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BPF dispatcher is a multi-way branch code generator, mainly targeted for XDP programs. When an XDP program is executed via the bpf_prog_run_xdp(), it is invoked via an indirect call. The indirect call has a substantial performance impact, when retpolines are enabled. The dispatcher transform indirect calls to direct calls, and therefore avoids the retpoline. The dispatcher is generated using the BPF JIT, and relies on text poking provided by bpf_arch_text_poke(). The dispatcher hijacks a trampoline function it via the __fentry__ nop of the trampoline. One dispatcher instance currently supports up to 64 dispatch points. A user creates a dispatcher with its corresponding trampoline with the DEFINE_BPF_DISPATCHER macro. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-3-bjorn.topel@gmail.com --- arch/x86/net/bpf_jit_comp.c | 122 ++++++++++++++++++++++++++++ include/linux/bpf.h | 56 +++++++++++++ kernel/bpf/Makefile | 1 + kernel/bpf/dispatcher.c | 158 ++++++++++++++++++++++++++++++++++++ 4 files changed, 337 insertions(+) create mode 100644 kernel/bpf/dispatcher.c diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b8be18427277..3ce7ad41bd6f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -10,10 +10,12 @@ #include #include #include +#include #include #include #include #include +#include static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -1530,6 +1532,126 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags return 0; } +static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) +{ + u8 *prog = *pprog; + int cnt = 0; + s64 offset; + + offset = func - (ip + 2 + 4); + if (!is_simm32(offset)) { + pr_err("Target %p is out of range\n", func); + return -EINVAL; + } + EMIT2_off32(0x0F, jmp_cond + 0x10, offset); + *pprog = prog; + return 0; +} + +static int emit_fallback_jump(u8 **pprog) +{ + u8 *prog = *pprog; + int err = 0; + +#ifdef CONFIG_RETPOLINE + /* Note that this assumes the the compiler uses external + * thunks for indirect calls. Both clang and GCC use the same + * naming convention for external thunks. + */ + err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog); +#else + int cnt = 0; + + EMIT2(0xFF, 0xE2); /* jmp rdx */ +#endif + *pprog = prog; + return err; +} + +static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) +{ + int pivot, err, jg_bytes = 1, cnt = 0; + u8 *jg_reloc, *prog = *pprog; + s64 jg_offset; + + if (a == b) { + /* Leaf node of recursion, i.e. not a range of indices + * anymore. + */ + EMIT1(add_1mod(0x48, BPF_REG_3)); /* cmp rdx,func */ + if (!is_simm32(progs[a])) + return -1; + EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), + progs[a]); + err = emit_cond_near_jump(&prog, /* je func */ + (void *)progs[a], prog, + X86_JE); + if (err) + return err; + + err = emit_fallback_jump(&prog); /* jmp thunk/indirect */ + if (err) + return err; + + *pprog = prog; + return 0; + } + + /* Not a leaf node, so we pivot, and recursively descend into + * the lower and upper ranges. + */ + pivot = (b - a) / 2; + EMIT1(add_1mod(0x48, BPF_REG_3)); /* cmp rdx,func */ + if (!is_simm32(progs[a + pivot])) + return -1; + EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a + pivot]); + + if (pivot > 2) { /* jg upper_part */ + /* Require near jump. */ + jg_bytes = 4; + EMIT2_off32(0x0F, X86_JG + 0x10, 0); + } else { + EMIT2(X86_JG, 0); + } + jg_reloc = prog; + + err = emit_bpf_dispatcher(&prog, a, a + pivot, /* emit lower_part */ + progs); + if (err) + return err; + + jg_offset = prog - jg_reloc; + emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes); + + err = emit_bpf_dispatcher(&prog, a + pivot + 1, /* emit upper_part */ + b, progs); + if (err) + return err; + + *pprog = prog; + return 0; +} + +static int cmp_ips(const void *a, const void *b) +{ + const s64 *ipa = a; + const s64 *ipb = b; + + if (*ipa > *ipb) + return 1; + if (*ipa < *ipb) + return -1; + return 0; +} + +int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +{ + u8 *prog = image; + + sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL); + return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs); +} + struct x64_jit_data { struct bpf_binary_header *header; int *addrs; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5d744828b399..53ae4a50abe4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -470,12 +470,61 @@ struct bpf_trampoline { void *image; u64 selector; }; + +#define BPF_DISPATCHER_MAX 64 /* Fits in 2048B */ + +struct bpf_dispatcher_prog { + struct bpf_prog *prog; + refcount_t users; +}; + +struct bpf_dispatcher { + /* dispatcher mutex */ + struct mutex mutex; + void *func; + struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; + int num_progs; + void *image; + u32 image_off; +}; + #ifdef CONFIG_BPF_JIT struct bpf_trampoline *bpf_trampoline_lookup(u64 key); int bpf_trampoline_link_prog(struct bpf_prog *prog); int bpf_trampoline_unlink_prog(struct bpf_prog *prog); void bpf_trampoline_put(struct bpf_trampoline *tr); void *bpf_jit_alloc_exec_page(void); +#define BPF_DISPATCHER_INIT(name) { \ + .mutex = __MUTEX_INITIALIZER(name.mutex), \ + .func = &name##func, \ + .progs = {}, \ + .num_progs = 0, \ + .image = NULL, \ + .image_off = 0 \ +} + +#define DEFINE_BPF_DISPATCHER(name) \ + noinline unsigned int name##func( \ + const void *ctx, \ + const struct bpf_insn *insnsi, \ + unsigned int (*bpf_func)(const void *, \ + const struct bpf_insn *)) \ + { \ + return bpf_func(ctx, insnsi); \ + } \ + EXPORT_SYMBOL(name##func); \ + struct bpf_dispatcher name = BPF_DISPATCHER_INIT(name); +#define DECLARE_BPF_DISPATCHER(name) \ + unsigned int name##func( \ + const void *ctx, \ + const struct bpf_insn *insnsi, \ + unsigned int (*bpf_func)(const void *, \ + const struct bpf_insn *)); \ + extern struct bpf_dispatcher name; +#define BPF_DISPATCHER_FUNC(name) name##func +#define BPF_DISPATCHER_PTR(name) (&name) +void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, + struct bpf_prog *to); #else static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { @@ -490,6 +539,13 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) return -ENOTSUPP; } static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} +#define DEFINE_BPF_DISPATCHER(name) +#define DECLARE_BPF_DISPATCHER(name) +#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nopfunc +#define BPF_DISPATCHER_PTR(name) NULL +static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, + struct bpf_prog *from, + struct bpf_prog *to) {} #endif struct bpf_func_info_aux { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3f671bf617e8..d4f330351f87 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c new file mode 100644 index 000000000000..204ee61a3904 --- /dev/null +++ b/kernel/bpf/dispatcher.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2019 Intel Corporation. */ + +#include +#include +#include + +/* The BPF dispatcher is a multiway branch code generator. The + * dispatcher is a mechanism to avoid the performance penalty of an + * indirect call, which is expensive when retpolines are enabled. A + * dispatch client registers a BPF program into the dispatcher, and if + * there is available room in the dispatcher a direct call to the BPF + * program will be generated. All calls to the BPF programs called via + * the dispatcher will then be a direct call, instead of an + * indirect. The dispatcher hijacks a trampoline function it via the + * __fentry__ of the trampoline. The trampoline function has the + * following signature: + * + * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi, + * unsigned int (*bpf_func)(const void *, + * const struct bpf_insn *)); + */ + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog( + struct bpf_dispatcher *d, struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (prog == d->progs[i].prog) + return &d->progs[i]; + } + return NULL; +} + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_free( + struct bpf_dispatcher *d) +{ + return bpf_dispatcher_find_prog(d, NULL); +} + +static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (entry) { + refcount_inc(&entry->users); + return false; + } + + entry = bpf_dispatcher_find_free(d); + if (!entry) + return false; + + bpf_prog_inc(prog); + entry->prog = prog; + refcount_set(&entry->users, 1); + d->num_progs++; + return true; +} + +static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (!entry) + return false; + + if (refcount_dec_and_test(&entry->users)) { + entry->prog = NULL; + bpf_prog_put(prog); + d->num_progs--; + return true; + } + return false; +} + +int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +{ + return -ENOTSUPP; +} + +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +{ + s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (d->progs[i].prog) + *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; + } + return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); +} + +static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) +{ + void *old, *new; + u32 noff; + int err; + + if (!prev_num_progs) { + old = NULL; + noff = 0; + } else { + old = d->image + d->image_off; + noff = d->image_off ^ (PAGE_SIZE / 2); + } + + new = d->num_progs ? d->image + noff : NULL; + if (new) { + if (bpf_dispatcher_prepare(d, new)) + return; + } + + err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new); + if (err || !new) + return; + + d->image_off = noff; +} + +void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, + struct bpf_prog *to) +{ + bool changed = false; + int prev_num_progs; + + if (from == to) + return; + + mutex_lock(&d->mutex); + if (!d->image) { + d->image = bpf_jit_alloc_exec_page(); + if (!d->image) + goto out; + } + + prev_num_progs = d->num_progs; + changed |= bpf_dispatcher_remove_prog(d, from); + changed |= bpf_dispatcher_add_prog(d, to); + + if (!changed) + goto out; + + bpf_dispatcher_update(d, prev_num_progs); +out: + mutex_unlock(&d->mutex); +} From 7e6897f95935973c3253fd756135b5ea58043dc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 13 Dec 2019 18:51:09 +0100 Subject: [PATCH 023/127] bpf, xdp: Start using the BPF dispatcher for XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds a BPF dispatcher for XDP. The dispatcher is updated from the XDP control-path, dev_xdp_install(), and used when an XDP program is run via bpf_prog_run_xdp(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-4-bjorn.topel@gmail.com --- include/linux/bpf.h | 15 +++++++++++++++ include/linux/filter.h | 40 ++++++++++++++++++++++++---------------- kernel/bpf/syscall.c | 26 ++++++++++++++++++-------- net/core/dev.c | 19 ++++++++++++++++++- net/core/filter.c | 8 ++++++++ 5 files changed, 83 insertions(+), 25 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 53ae4a50abe4..5970989b99d1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,6 +488,14 @@ struct bpf_dispatcher { u32 image_off; }; +static __always_inline unsigned int bpf_dispatcher_nopfunc( + const void *ctx, + const struct bpf_insn *insnsi, + unsigned int (*bpf_func)(const void *, + const struct bpf_insn *)) +{ + return bpf_func(ctx, insnsi); +} #ifdef CONFIG_BPF_JIT struct bpf_trampoline *bpf_trampoline_lookup(u64 key); int bpf_trampoline_link_prog(struct bpf_prog *prog); @@ -997,6 +1005,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog); +struct bpf_prog *bpf_prog_by_id(u32 id); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1128,6 +1138,11 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, static inline void bpf_map_put(struct bpf_map *map) { } + +static inline struct bpf_prog *bpf_prog_by_id(u32 id) +{ + return ERR_PTR(-ENOTSUPP); +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, diff --git a/include/linux/filter.h b/include/linux/filter.h index a141cb07e76a..37ac7025031d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -559,23 +559,26 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); -#define BPF_PROG_RUN(prog, ctx) ({ \ - u32 ret; \ - cant_sleep(); \ - if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *stats; \ - u64 start = sched_clock(); \ - ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \ - stats = this_cpu_ptr(prog->aux->stats); \ - u64_stats_update_begin(&stats->syncp); \ - stats->cnt++; \ - stats->nsecs += sched_clock() - start; \ - u64_stats_update_end(&stats->syncp); \ - } else { \ - ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \ - } \ +#define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ + u32 ret; \ + cant_sleep(); \ + if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ + struct bpf_prog_stats *stats; \ + u64 start = sched_clock(); \ + ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&stats->syncp); \ + stats->cnt++; \ + stats->nsecs += sched_clock() - start; \ + u64_stats_update_end(&stats->syncp); \ + } else { \ + ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + } \ ret; }) +#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx, \ + bpf_dispatcher_nopfunc) + #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN struct bpf_skb_data_end { @@ -699,6 +702,8 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, return res; } +DECLARE_BPF_DISPATCHER(bpf_dispatcher_xdp) + static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { @@ -708,9 +713,12 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, * already takes rcu_read_lock() when fetching the program, so * it's not necessary here anymore. */ - return BPF_PROG_RUN(prog, xdp); + return __BPF_PROG_RUN(prog, xdp, + BPF_DISPATCHER_FUNC(bpf_dispatcher_xdp)); } +void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); + static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) { return prog->len * sizeof(struct bpf_insn); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 66b90eaf99fe..b08c362f4e02 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2338,6 +2338,23 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id +struct bpf_prog *bpf_prog_by_id(u32 id) +{ + struct bpf_prog *prog; + + if (!id) + return ERR_PTR(-ENOENT); + + spin_lock_bh(&prog_idr_lock); + prog = idr_find(&prog_idr, id); + if (prog) + prog = bpf_prog_inc_not_zero(prog); + else + prog = ERR_PTR(-ENOENT); + spin_unlock_bh(&prog_idr_lock); + return prog; +} + static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) { struct bpf_prog *prog; @@ -2350,14 +2367,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - spin_lock_bh(&prog_idr_lock); - prog = idr_find(&prog_idr, id); - if (prog) - prog = bpf_prog_inc_not_zero(prog); - else - prog = ERR_PTR(-ENOENT); - spin_unlock_bh(&prog_idr_lock); - + prog = bpf_prog_by_id(id); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/net/core/dev.c b/net/core/dev.c index 2c277b8aba38..255d3cf35360 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8542,7 +8542,17 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { + bool non_hw = !(flags & XDP_FLAGS_HW_MODE); + struct bpf_prog *prev_prog = NULL; struct netdev_bpf xdp; + int err; + + if (non_hw) { + prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op, + XDP_QUERY_PROG)); + if (IS_ERR(prev_prog)) + prev_prog = NULL; + } memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -8553,7 +8563,14 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, xdp.flags = flags; xdp.prog = prog; - return bpf_op(dev, &xdp); + err = bpf_op(dev, &xdp); + if (!err && non_hw) + bpf_prog_change_xdp(prev_prog, prog); + + if (prev_prog) + bpf_prog_put(prev_prog); + + return err; } static void dev_xdp_uninstall(struct net_device *dev) diff --git a/net/core/filter.c b/net/core/filter.c index f1e703eed3d2..a411f7835dee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8940,3 +8940,11 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; #endif /* CONFIG_INET */ + +DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp) + +void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) +{ + bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp), + prev_prog, prog); +} From f23c4b3924d2e9382820ee677b68d42d5dd7b08b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 13 Dec 2019 18:51:10 +0100 Subject: [PATCH 024/127] bpf: Start using the BPF dispatcher in BPF_TEST_RUN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to properly exercise the BPF dispatcher, this commit adds BPF dispatcher usage to BPF_TEST_RUN when executing XDP programs. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-5-bjorn.topel@gmail.com --- net/bpf/test_run.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 85c8cbbada92..5016c538d3ce 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,7 +15,7 @@ #include static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, - u32 *retval, u32 *time) + u32 *retval, u32 *time, bool xdp) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; enum bpf_cgroup_storage_type stype; @@ -41,7 +41,11 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { bpf_cgroup_storage_set(storage); - *retval = BPF_PROG_RUN(prog, ctx); + + if (xdp) + *retval = bpf_prog_run_xdp(prog, ctx); + else + *retval = BPF_PROG_RUN(prog, ctx); if (signal_pending(current)) { ret = -EINTR; @@ -356,7 +360,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, ret = convert___skb_to_skb(skb, ctx); if (ret) goto out; - ret = bpf_test_run(prog, skb, repeat, &retval, &duration); + ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false); if (ret) goto out; if (!is_l2) { @@ -413,8 +417,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; - - ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration); + bpf_prog_change_xdp(NULL, prog); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || @@ -422,6 +426,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: + bpf_prog_change_xdp(prog, NULL); kfree(data); return ret; } From e754f5a6e36b63d6732f52adcbe4c447fd66896f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 13 Dec 2019 18:51:11 +0100 Subject: [PATCH 025/127] selftests: bpf: Add xdp_perf test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xdp_perf is a dummy XDP test, only used to measure the the cost of jumping into a naive XDP program one million times. To build and run the program: $ cd tools/testing/selftests/bpf $ make $ ./test_progs -v -t xdp_perf Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-6-bjorn.topel@gmail.com --- .../selftests/bpf/prog_tests/xdp_perf.c | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_perf.c diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c new file mode 100644 index 000000000000..7185bee16fe4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +void test_xdp_perf(void) +{ + const char *file = "./xdp_dummy.o"; + __u32 duration, retval, size; + struct bpf_object *obj; + char in[128], out[128]; + int err, prog_fd; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + err = bpf_prog_test_run(prog_fd, 1000000, &in[0], 128, + out, &size, &retval, &duration); + + CHECK(err || retval != XDP_PASS || size != 128, + "xdp-perf", + "err %d errno %d retval %d size %d\n", + err, errno, retval, size); + + bpf_object__close(obj); +} From 116eb788f57c9c35c40b29cfaa2607020de99a84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 13 Dec 2019 18:51:12 +0100 Subject: [PATCH 026/127] bpf, x86: Align dispatcher branch targets to 16B MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit >From Intel 64 and IA-32 Architectures Optimization Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler Coding Rule 11: All branch targets should be 16-byte aligned. This commits aligns branch targets according to the Intel manual. The nops used to align branch targets make the dispatcher larger, and therefore the number of supported dispatch points/programs are descreased from 64 to 48. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-7-bjorn.topel@gmail.com --- arch/x86/net/bpf_jit_comp.c | 30 +++++++++++++++++++++++++++++- include/linux/bpf.h | 2 +- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 3ce7ad41bd6f..4c8a2d1f8470 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1548,6 +1548,26 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) return 0; } +static void emit_nops(u8 **pprog, unsigned int len) +{ + unsigned int i, noplen; + u8 *prog = *pprog; + int cnt = 0; + + while (len > 0) { + noplen = len; + + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + + for (i = 0; i < noplen; i++) + EMIT1(ideal_nops[noplen][i]); + len -= noplen; + } + + *pprog = prog; +} + static int emit_fallback_jump(u8 **pprog) { u8 *prog = *pprog; @@ -1570,8 +1590,8 @@ static int emit_fallback_jump(u8 **pprog) static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) { + u8 *jg_reloc, *jg_target, *prog = *pprog; int pivot, err, jg_bytes = 1, cnt = 0; - u8 *jg_reloc, *prog = *pprog; s64 jg_offset; if (a == b) { @@ -1620,6 +1640,14 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) if (err) return err; + /* From Intel 64 and IA-32 Architectures Optimization + * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler + * Coding Rule 11: All branch targets should be 16-byte + * aligned. + */ + jg_target = PTR_ALIGN(prog, 16); + if (jg_target != prog) + emit_nops(&prog, jg_target - prog); jg_offset = prog - jg_reloc; emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5970989b99d1..d467983e61bb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -471,7 +471,7 @@ struct bpf_trampoline { u64 selector; }; -#define BPF_DISPATCHER_MAX 64 /* Fits in 2048B */ +#define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */ struct bpf_dispatcher_prog { struct bpf_prog *prog; From 850a88cc4096fe1df407452ba2e4d28cf5b3eee9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 13 Dec 2019 14:30:27 -0800 Subject: [PATCH 027/127] bpf: Expose __sk_buff wire_len/gso_segs to BPF_PROG_TEST_RUN wire_len should not be less than real len and is capped by GSO_MAX_SIZE. gso_segs is capped by GSO_MAX_SEGS. v2: * set wire_len to skb->len when passed wire_len is 0 (Alexei Starovoitov) Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191213223028.161282-1-sdf@google.com --- net/bpf/test_run.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 5016c538d3ce..93a9c87787e0 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -267,8 +267,10 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) return -EINVAL; /* tstamp is allowed */ + /* wire_len is allowed */ + /* gso_segs is allowed */ - if (!range_is_zero(__skb, offsetofend(struct __sk_buff, tstamp), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), sizeof(struct __sk_buff))) return -EINVAL; @@ -276,6 +278,19 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + if (__skb->wire_len == 0) { + cb->pkt_len = skb->len; + } else { + if (__skb->wire_len < skb->len || + __skb->wire_len > GSO_MAX_SIZE) + return -EINVAL; + cb->pkt_len = __skb->wire_len; + } + + if (__skb->gso_segs > GSO_MAX_SEGS) + return -EINVAL; + skb_shinfo(skb)->gso_segs = __skb->gso_segs; + return 0; } @@ -289,6 +304,8 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) __skb->priority = skb->priority; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); + __skb->wire_len = cb->pkt_len; + __skb->gso_segs = skb_shinfo(skb)->gso_segs; } int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, From a06bf42f5a95ff462401d59d0c08cf4620213647 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 13 Dec 2019 14:30:28 -0800 Subject: [PATCH 028/127] selftests/bpf: Test wire_len/gso_segs in BPF_PROG_TEST_RUN Make sure we can pass arbitrary data in wire_len/gso_segs. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191213223028.161282-2-sdf@google.com --- tools/testing/selftests/bpf/prog_tests/skb_ctx.c | 2 ++ tools/testing/selftests/bpf/progs/test_skb_ctx.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c index a2eb8db8dafb..edf5e8c7d400 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c @@ -11,6 +11,8 @@ void test_skb_ctx(void) .cb[4] = 5, .priority = 6, .tstamp = 7, + .wire_len = 100, + .gso_segs = 8, }; struct bpf_prog_test_run_attr tattr = { .data_in = &pkt_v4, diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c index 2a9f4c736ebc..534fbf9a7344 100644 --- a/tools/testing/selftests/bpf/progs/test_skb_ctx.c +++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c @@ -18,5 +18,10 @@ int process(struct __sk_buff *skb) skb->priority++; skb->tstamp++; + if (skb->wire_len != 100) + return 1; + if (skb->gso_segs != 8) + return 1; + return 0; } From ec2025095cf6acda3233a4301809596938b47da5 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 13 Dec 2019 20:10:04 +0100 Subject: [PATCH 029/127] bpftool: Match several programs with same tag When several BPF programs have the same tag, bpftool matches only the first (in ID order). This patch changes that behavior such that dump and show commands return all matched programs. Commands that require a single program (e.g., pin and attach) will error out if given a tag that matches several. bpftool prog dump will also error out if file or visual are given and several programs have the given tag. In the case of the dump command, a program header is added before each dump only if the tag matches several programs; this patch doesn't change the output if a single program matches. The output when several programs match thus looks as follows. $ ./bpftool prog dump xlated tag 6deef7357e7b4530 3: cgroup_skb tag 6deef7357e7b4530 gpl 0: (bf) r6 = r1 [...] 7: (95) exit 4: cgroup_skb tag 6deef7357e7b4530 gpl 0: (bf) r6 = r1 [...] 7: (95) exit Signed-off-by: Paul Chaignon Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/fb1fe943202659a69cd21dd5b907c205af1e1e22.1576263640.git.paul.chaignon@gmail.com --- .../bpftool/Documentation/bpftool-prog.rst | 16 +- tools/bpf/bpftool/prog.c | 372 ++++++++++++------ 2 files changed, 268 insertions(+), 120 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 7a374b3c851d..d377d0cb7923 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -53,8 +53,10 @@ DESCRIPTION =========== **bpftool prog { show | list }** [*PROG*] Show information about loaded programs. If *PROG* is - specified show information only about given program, otherwise - list all programs currently loaded on the system. + specified show information only about given programs, + otherwise list all programs currently loaded on the system. + In case of **tag**, *PROG* may match several programs which + will all be shown. Output will start with program ID followed by program type and zero or more named attributes (depending on kernel version). @@ -68,11 +70,15 @@ DESCRIPTION performed via the **kernel.bpf_stats_enabled** sysctl knob. **bpftool prog dump xlated** *PROG* [{ **file** *FILE* | **opcodes** | **visual** | **linum** }] - Dump eBPF instructions of the program from the kernel. By + Dump eBPF instructions of the programs from the kernel. By default, eBPF will be disassembled and printed to standard output in human-readable format. In this case, **opcodes** controls if raw opcodes should be printed as well. + In case of **tag**, *PROG* may match several programs which + will all be dumped. However, if **file** or **visual** is + specified, *PROG* must match a single program. + If **file** is specified, the binary image will instead be written to *FILE*. @@ -80,15 +86,17 @@ DESCRIPTION built instead, and eBPF instructions will be presented with CFG in DOT format, on standard output. - If the prog has line_info available, the source line will + If the programs have line_info available, the source line will be displayed by default. If **linum** is specified, the filename, line number and line column will also be displayed on top of the source line. **bpftool prog dump jited** *PROG* [{ **file** *FILE* | **opcodes** | **linum** }] Dump jited image (host machine code) of the program. + If *FILE* is specified image will be written to a file, otherwise it will be disassembled and printed to stdout. + *PROG* must match a single program when **file** is specified. **opcodes** controls if raw opcodes will be printed. diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 4535c863d2cd..37948c47aabd 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -25,6 +25,11 @@ #include "main.h" #include "xlated_dumper.h" +enum dump_mode { + DUMP_JITED, + DUMP_XLATED, +}; + static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", @@ -77,11 +82,12 @@ static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) strftime(buf, size, "%FT%T%z", &load_tm); } -static int prog_fd_by_tag(unsigned char *tag) +static int prog_fd_by_tag(unsigned char *tag, int **fds) { unsigned int id = 0; + int fd, nb_fds = 0; + void *tmp; int err; - int fd; while (true) { struct bpf_prog_info info = {}; @@ -89,36 +95,53 @@ static int prog_fd_by_tag(unsigned char *tag) err = bpf_prog_get_next_id(id, &id); if (err) { - p_err("%s", strerror(errno)); - return -1; + if (errno != ENOENT) { + p_err("%s", strerror(errno)); + goto err_close_fds; + } + return nb_fds; } fd = bpf_prog_get_fd_by_id(id); if (fd < 0) { p_err("can't get prog by id (%u): %s", id, strerror(errno)); - return -1; + goto err_close_fds; } err = bpf_obj_get_info_by_fd(fd, &info, &len); if (err) { p_err("can't get prog info (%u): %s", id, strerror(errno)); - close(fd); - return -1; + goto err_close_fd; } - if (!memcmp(tag, info.tag, BPF_TAG_SIZE)) - return fd; + if (memcmp(tag, info.tag, BPF_TAG_SIZE)) { + close(fd); + continue; + } - close(fd); + if (nb_fds > 0) { + tmp = realloc(*fds, (nb_fds + 1) * sizeof(int)); + if (!tmp) { + p_err("failed to realloc"); + goto err_close_fd; + } + *fds = tmp; + } + (*fds)[nb_fds++] = fd; } + +err_close_fd: + close(fd); +err_close_fds: + while (--nb_fds >= 0) + close((*fds)[nb_fds]); + return -1; } -int prog_parse_fd(int *argc, char ***argv) +static int prog_parse_fds(int *argc, char ***argv, int **fds) { - int fd; - if (is_prefix(**argv, "id")) { unsigned int id; char *endptr; @@ -132,10 +155,12 @@ int prog_parse_fd(int *argc, char ***argv) } NEXT_ARGP(); - fd = bpf_prog_get_fd_by_id(id); - if (fd < 0) + (*fds)[0] = bpf_prog_get_fd_by_id(id); + if ((*fds)[0] < 0) { p_err("get by id (%u): %s", id, strerror(errno)); - return fd; + return -1; + } + return 1; } else if (is_prefix(**argv, "tag")) { unsigned char tag[BPF_TAG_SIZE]; @@ -149,7 +174,7 @@ int prog_parse_fd(int *argc, char ***argv) } NEXT_ARGP(); - return prog_fd_by_tag(tag); + return prog_fd_by_tag(tag, fds); } else if (is_prefix(**argv, "pinned")) { char *path; @@ -158,13 +183,43 @@ int prog_parse_fd(int *argc, char ***argv) path = **argv; NEXT_ARGP(); - return open_obj_pinned_any(path, BPF_OBJ_PROG); + (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_PROG); + if ((*fds)[0] < 0) + return -1; + return 1; } p_err("expected 'id', 'tag' or 'pinned', got: '%s'?", **argv); return -1; } +int prog_parse_fd(int *argc, char ***argv) +{ + int *fds = NULL; + int nb_fds, fd; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = prog_parse_fds(argc, argv, &fds); + if (nb_fds != 1) { + if (nb_fds > 1) { + p_err("several programs match this handle"); + while (nb_fds--) + close(fds[nb_fds]); + } + fd = -1; + goto exit_free; + } + + fd = fds[0]; +exit_free: + free(fds); + return fd; +} + static void show_prog_maps(int fd, u32 num_maps) { struct bpf_prog_info info = {}; @@ -194,11 +249,8 @@ static void show_prog_maps(int fd, u32 num_maps) } } -static void print_prog_json(struct bpf_prog_info *info, int fd) +static void print_prog_header_json(struct bpf_prog_info *info) { - char *memlock; - - jsonw_start_object(json_wtr); jsonw_uint_field(json_wtr, "id", info->id); if (info->type < ARRAY_SIZE(prog_type_name)) jsonw_string_field(json_wtr, "type", @@ -219,7 +271,14 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns); jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt); } +} +static void print_prog_json(struct bpf_prog_info *info, int fd) +{ + char *memlock; + + jsonw_start_object(json_wtr); + print_prog_header_json(info); print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); if (info->load_time) { @@ -268,10 +327,8 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) jsonw_end_object(json_wtr); } -static void print_prog_plain(struct bpf_prog_info *info, int fd) +static void print_prog_header_plain(struct bpf_prog_info *info) { - char *memlock; - printf("%u: ", info->id); if (info->type < ARRAY_SIZE(prog_type_name)) printf("%s ", prog_type_name[info->type]); @@ -289,6 +346,13 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) printf(" run_time_ns %lld run_cnt %lld", info->run_time_ns, info->run_cnt); printf("\n"); +} + +static void print_prog_plain(struct bpf_prog_info *info, int fd) +{ + char *memlock; + + print_prog_header_plain(info); if (info->load_time) { char buf[32]; @@ -349,6 +413,40 @@ static int show_prog(int fd) return 0; } +static int do_show_subset(int argc, char **argv) +{ + int *fds = NULL; + int nb_fds, i; + int err = -1; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = prog_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (json_output && nb_fds > 1) + jsonw_start_array(json_wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + err = show_prog(fds[i]); + if (err) { + for (; i < nb_fds; i++) + close(fds[i]); + break; + } + close(fds[i]); + } + if (json_output && nb_fds > 1) + jsonw_end_array(json_wtr); /* root array */ + +exit_free: + free(fds); + return err; +} + static int do_show(int argc, char **argv) { __u32 id = 0; @@ -358,15 +456,8 @@ static int do_show(int argc, char **argv) if (show_pinned) build_pinned_obj_table(&prog_table, BPF_OBJ_PROG); - if (argc == 2) { - fd = prog_parse_fd(&argc, &argv); - if (fd < 0) - return -1; - - err = show_prog(fd); - close(fd); - return err; - } + if (argc == 2) + return do_show_subset(argc, argv); if (argc) return BAD_ARG(); @@ -408,101 +499,32 @@ static int do_show(int argc, char **argv) return err; } -static int do_dump(int argc, char **argv) +static int +prog_dump(struct bpf_prog_info *info, enum dump_mode mode, + char *filepath, bool opcodes, bool visual, bool linum) { - struct bpf_prog_info_linear *info_linear; struct bpf_prog_linfo *prog_linfo = NULL; - enum {DUMP_JITED, DUMP_XLATED} mode; const char *disasm_opt = NULL; - struct bpf_prog_info *info; struct dump_data dd = {}; void *func_info = NULL; struct btf *btf = NULL; - char *filepath = NULL; - bool opcodes = false; - bool visual = false; char func_sig[1024]; unsigned char *buf; - bool linum = false; __u32 member_len; - __u64 arrays; ssize_t n; int fd; - if (is_prefix(*argv, "jited")) { - if (disasm_init()) - return -1; - mode = DUMP_JITED; - } else if (is_prefix(*argv, "xlated")) { - mode = DUMP_XLATED; - } else { - p_err("expected 'xlated' or 'jited', got: %s", *argv); - return -1; - } - NEXT_ARG(); - - if (argc < 2) - usage(); - - fd = prog_parse_fd(&argc, &argv); - if (fd < 0) - return -1; - - if (is_prefix(*argv, "file")) { - NEXT_ARG(); - if (!argc) { - p_err("expected file path"); - return -1; - } - - filepath = *argv; - NEXT_ARG(); - } else if (is_prefix(*argv, "opcodes")) { - opcodes = true; - NEXT_ARG(); - } else if (is_prefix(*argv, "visual")) { - visual = true; - NEXT_ARG(); - } else if (is_prefix(*argv, "linum")) { - linum = true; - NEXT_ARG(); - } - - if (argc) { - usage(); - return -1; - } - - if (mode == DUMP_JITED) - arrays = 1UL << BPF_PROG_INFO_JITED_INSNS; - else - arrays = 1UL << BPF_PROG_INFO_XLATED_INSNS; - - arrays |= 1UL << BPF_PROG_INFO_JITED_KSYMS; - arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS; - arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO; - arrays |= 1UL << BPF_PROG_INFO_LINE_INFO; - arrays |= 1UL << BPF_PROG_INFO_JITED_LINE_INFO; - - info_linear = bpf_program__get_prog_info_linear(fd, arrays); - close(fd); - if (IS_ERR_OR_NULL(info_linear)) { - p_err("can't get prog info: %s", strerror(errno)); - return -1; - } - - info = &info_linear->info; if (mode == DUMP_JITED) { if (info->jited_prog_len == 0) { p_info("no instructions returned"); - goto err_free; + return -1; } buf = (unsigned char *)(info->jited_prog_insns); member_len = info->jited_prog_len; } else { /* DUMP_XLATED */ if (info->xlated_prog_len == 0) { p_err("error retrieving insn dump: kernel.kptr_restrict set?"); - goto err_free; + return -1; } buf = (unsigned char *)info->xlated_prog_insns; member_len = info->xlated_prog_len; @@ -510,7 +532,7 @@ static int do_dump(int argc, char **argv) if (info->btf_id && btf__get_from_id(info->btf_id, &btf)) { p_err("failed to get btf"); - goto err_free; + return -1; } func_info = (void *)info->func_info; @@ -526,7 +548,7 @@ static int do_dump(int argc, char **argv) if (fd < 0) { p_err("can't open file %s: %s", filepath, strerror(errno)); - goto err_free; + return -1; } n = write(fd, buf, member_len); @@ -534,7 +556,7 @@ static int do_dump(int argc, char **argv) if (n != member_len) { p_err("error writing output file: %s", n < 0 ? strerror(errno) : "short write"); - goto err_free; + return -1; } if (json_output) @@ -548,7 +570,7 @@ static int do_dump(int argc, char **argv) info->netns_ino, &disasm_opt); if (!name) - goto err_free; + return -1; } if (info->nr_jited_func_lens && info->jited_func_lens) { @@ -643,12 +665,130 @@ static int do_dump(int argc, char **argv) kernel_syms_destroy(&dd); } - free(info_linear); return 0; +} -err_free: - free(info_linear); - return -1; +static int do_dump(int argc, char **argv) +{ + struct bpf_prog_info_linear *info_linear; + char *filepath = NULL; + bool opcodes = false; + bool visual = false; + enum dump_mode mode; + bool linum = false; + int *fds = NULL; + int nb_fds, i = 0; + int err = -1; + __u64 arrays; + + if (is_prefix(*argv, "jited")) { + if (disasm_init()) + return -1; + mode = DUMP_JITED; + } else if (is_prefix(*argv, "xlated")) { + mode = DUMP_XLATED; + } else { + p_err("expected 'xlated' or 'jited', got: %s", *argv); + return -1; + } + NEXT_ARG(); + + if (argc < 2) + usage(); + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = prog_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (is_prefix(*argv, "file")) { + NEXT_ARG(); + if (!argc) { + p_err("expected file path"); + goto exit_close; + } + if (nb_fds > 1) { + p_err("several programs matched"); + goto exit_close; + } + + filepath = *argv; + NEXT_ARG(); + } else if (is_prefix(*argv, "opcodes")) { + opcodes = true; + NEXT_ARG(); + } else if (is_prefix(*argv, "visual")) { + if (nb_fds > 1) { + p_err("several programs matched"); + goto exit_close; + } + + visual = true; + NEXT_ARG(); + } else if (is_prefix(*argv, "linum")) { + linum = true; + NEXT_ARG(); + } + + if (argc) { + usage(); + goto exit_close; + } + + if (mode == DUMP_JITED) + arrays = 1UL << BPF_PROG_INFO_JITED_INSNS; + else + arrays = 1UL << BPF_PROG_INFO_XLATED_INSNS; + + arrays |= 1UL << BPF_PROG_INFO_JITED_KSYMS; + arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS; + arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO; + arrays |= 1UL << BPF_PROG_INFO_LINE_INFO; + arrays |= 1UL << BPF_PROG_INFO_JITED_LINE_INFO; + + if (json_output && nb_fds > 1) + jsonw_start_array(json_wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + info_linear = bpf_program__get_prog_info_linear(fds[i], arrays); + if (IS_ERR_OR_NULL(info_linear)) { + p_err("can't get prog info: %s", strerror(errno)); + break; + } + + if (json_output && nb_fds > 1) { + jsonw_start_object(json_wtr); /* prog object */ + print_prog_header_json(&info_linear->info); + jsonw_name(json_wtr, "insns"); + } else if (nb_fds > 1) { + print_prog_header_plain(&info_linear->info); + } + + err = prog_dump(&info_linear->info, mode, filepath, opcodes, + visual, linum); + + if (json_output && nb_fds > 1) + jsonw_end_object(json_wtr); /* prog object */ + else if (i != nb_fds - 1 && nb_fds > 1) + printf("\n"); + + free(info_linear); + if (err) + break; + close(fds[i]); + } + if (json_output && nb_fds > 1) + jsonw_end_array(json_wtr); /* root array */ + +exit_close: + for (; i < nb_fds; i++) + close(fds[i]); +exit_free: + free(fds); + return err; } static int do_pin(int argc, char **argv) From a7d22ca2a483d6c69c0791954447464297315ffa Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 13 Dec 2019 20:10:17 +0100 Subject: [PATCH 030/127] bpftool: Match programs by name When working with frequently modified BPF programs, both the ID and the tag may change. bpftool currently doesn't provide a "stable" way to match such programs. This patch implements lookup by name for programs. The show and dump commands will return all programs with the given name, whereas other commands will error out if several programs have the same name. Signed-off-by: Paul Chaignon Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/b5fc1a5dcfaeb5f16fc80295cdaa606dd2d91534.1576263640.git.paul.chaignon@gmail.com --- .../bpf/bpftool/Documentation/bpftool-map.rst | 2 +- .../bpftool/Documentation/bpftool-prog.rst | 12 +++++----- tools/bpf/bpftool/bash-completion/bpftool | 22 ++++++++++++++----- tools/bpf/bpftool/main.h | 2 +- tools/bpf/bpftool/prog.c | 22 +++++++++++++++---- 5 files changed, 42 insertions(+), 18 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 1c0f7146aab0..90d848b5b7d3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -41,7 +41,7 @@ MAP COMMANDS | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } | *DATA* := { [**hex**] *BYTES* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *VALUE* := { *DATA* | *MAP* | *PROG* } | *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } | *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index d377d0cb7923..64ddf8a4c518 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -33,7 +33,7 @@ PROG COMMANDS | **bpftool** **prog help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *TYPE* := { | **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | | **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | @@ -55,8 +55,8 @@ DESCRIPTION Show information about loaded programs. If *PROG* is specified show information only about given programs, otherwise list all programs currently loaded on the system. - In case of **tag**, *PROG* may match several programs which - will all be shown. + In case of **tag** or **name**, *PROG* may match several + programs which will all be shown. Output will start with program ID followed by program type and zero or more named attributes (depending on kernel version). @@ -75,9 +75,9 @@ DESCRIPTION output in human-readable format. In this case, **opcodes** controls if raw opcodes should be printed as well. - In case of **tag**, *PROG* may match several programs which - will all be dumped. However, if **file** or **visual** is - specified, *PROG* must match a single program. + In case of **tag** or **name**, *PROG* may match several + programs which will all be dumped. However, if **file** or + **visual** is specified, *PROG* must match a single program. If **file** is specified, the binary image will instead be written to *FILE*. diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 70493a6da206..05b5be4a6ef9 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -71,6 +71,12 @@ _bpftool_get_prog_tags() command sed -n 's/.*"tag": "\(.*\)",$/\1/p' )" -- "$cur" ) ) } +_bpftool_get_prog_names() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ + command sed -n 's/.*"name": "\(.*\)",$/\1/p' )" -- "$cur" ) ) +} + _bpftool_get_btf_ids() { COMPREPLY+=( $( compgen -W "$( bpftool -jp btf 2>&1 | \ @@ -201,6 +207,10 @@ _bpftool() _bpftool_get_prog_tags return 0 ;; + name) + _bpftool_get_prog_names + return 0 + ;; dev) _sysfs_get_netdevs return 0 @@ -263,7 +273,7 @@ _bpftool() ;; esac - local PROG_TYPE='id pinned tag' + local PROG_TYPE='id pinned tag name' local MAP_TYPE='id pinned' case $command in show|list) @@ -559,7 +569,7 @@ _bpftool() return 0 ;; prog_array) - local PROG_TYPE='id pinned tag' + local PROG_TYPE='id pinned tag name' COMPREPLY+=( $( compgen -W "$PROG_TYPE" \ -- "$cur" ) ) return 0 @@ -644,7 +654,7 @@ _bpftool() esac ;; btf) - local PROG_TYPE='id pinned tag' + local PROG_TYPE='id pinned tag name' local MAP_TYPE='id pinned' case $command in dump) @@ -735,7 +745,7 @@ _bpftool() connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl \ getsockopt setsockopt' local ATTACH_FLAGS='multi override' - local PROG_TYPE='id pinned tag' + local PROG_TYPE='id pinned tag name' case $prev in $command) _filedir @@ -760,7 +770,7 @@ _bpftool() elif [[ "$command" == "attach" ]]; then # We have an attach type on the command line, # but it is not the previous word, or - # "id|pinned|tag" (we already checked for + # "id|pinned|tag|name" (we already checked for # that). This should only leave the case when # we need attach flags for "attach" commamnd. _bpftool_one_of_list "$ATTACH_FLAGS" @@ -786,7 +796,7 @@ _bpftool() esac ;; net) - local PROG_TYPE='id pinned tag' + local PROG_TYPE='id pinned tag name' local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' case $command in show|list) diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 2899095f8254..a7ead7bb9447 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -42,7 +42,7 @@ #define BPF_TAG_FMT "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx" #define HELP_SPEC_PROGRAM \ - "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG }" + "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG | name PROG_NAME }" #define HELP_SPEC_OPTIONS \ "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} |\n" \ "\t {-m|--mapcompat} | {-n|--nomount} }" diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 37948c47aabd..2221bae037f1 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -82,7 +82,7 @@ static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) strftime(buf, size, "%FT%T%z", &load_tm); } -static int prog_fd_by_tag(unsigned char *tag, int **fds) +static int prog_fd_by_nametag(void *nametag, int **fds, bool tag) { unsigned int id = 0; int fd, nb_fds = 0; @@ -116,7 +116,8 @@ static int prog_fd_by_tag(unsigned char *tag, int **fds) goto err_close_fd; } - if (memcmp(tag, info.tag, BPF_TAG_SIZE)) { + if ((tag && memcmp(nametag, info.tag, BPF_TAG_SIZE)) || + (!tag && strncmp(nametag, info.name, BPF_OBJ_NAME_LEN))) { close(fd); continue; } @@ -174,7 +175,20 @@ static int prog_parse_fds(int *argc, char ***argv, int **fds) } NEXT_ARGP(); - return prog_fd_by_tag(tag, fds); + return prog_fd_by_nametag(tag, fds, true); + } else if (is_prefix(**argv, "name")) { + char *name; + + NEXT_ARGP(); + + name = **argv; + if (strlen(name) > BPF_OBJ_NAME_LEN - 1) { + p_err("can't parse name"); + return -1; + } + NEXT_ARGP(); + + return prog_fd_by_nametag(name, fds, false); } else if (is_prefix(**argv, "pinned")) { char *path; @@ -189,7 +203,7 @@ static int prog_parse_fds(int *argc, char ***argv, int **fds) return 1; } - p_err("expected 'id', 'tag' or 'pinned', got: '%s'?", **argv); + p_err("expected 'id', 'tag', 'name' or 'pinned', got: '%s'?", **argv); return -1; } From 99f9863a0c45f4e87cb99593015090fdc9f44398 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 13 Dec 2019 20:10:37 +0100 Subject: [PATCH 031/127] bpftool: Match maps by name This patch implements lookup by name for maps and changes the behavior of lookups by tag to be consistent with prog subcommands. Similarly to program subcommands, the show and dump commands will return all maps with the given name (or tag), whereas other commands will error out if several maps have the same name (resp. tag). When a map has BTF info, it is dumped in JSON with available BTF info. This patch requires that all matched maps have BTF info before switching the output format to JSON. Signed-off-by: Paul Chaignon Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/8de1c9f273860b3ea1680502928f4da2336b853e.1576263640.git.paul.chaignon@gmail.com --- .../bpf/bpftool/Documentation/bpftool-map.rst | 10 +- tools/bpf/bpftool/bash-completion/bpftool | 131 +++++- tools/bpf/bpftool/main.h | 2 +- tools/bpf/bpftool/map.c | 384 ++++++++++++++---- 4 files changed, 439 insertions(+), 88 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 90d848b5b7d3..cdeae8ae90ba 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -39,7 +39,7 @@ MAP COMMANDS | **bpftool** **map freeze** *MAP* | **bpftool** **map help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } | *DATA* := { [**hex**] *BYTES* } | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *VALUE* := { *DATA* | *MAP* | *PROG* } @@ -55,8 +55,9 @@ DESCRIPTION =========== **bpftool map { show | list }** [*MAP*] Show information about loaded maps. If *MAP* is specified - show information only about given map, otherwise list all - maps currently loaded on the system. + show information only about given maps, otherwise list all + maps currently loaded on the system. In case of **name**, + *MAP* may match several maps which will all be shown. Output will start with map ID followed by map type and zero or more named attributes (depending on kernel version). @@ -66,7 +67,8 @@ DESCRIPTION as *FILE*. **bpftool map dump** *MAP* - Dump all entries in a given *MAP*. + Dump all entries in a given *MAP*. In case of **name**, + *MAP* may match several maps which will all be dumped. **bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] Update map entry for a given *KEY*. diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 05b5be4a6ef9..21c676a1eeb1 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -59,6 +59,21 @@ _bpftool_get_map_ids_for_type() command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) } +_bpftool_get_map_names() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ + command sed -n 's/.*"name": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + +# Takes map type and adds matching map names to the list of suggestions. +_bpftool_get_map_names_for_type() +{ + local type="$1" + COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ + command grep -C2 "$type" | \ + command sed -n 's/.*"name": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + _bpftool_get_prog_ids() { COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ @@ -186,6 +201,52 @@ _bpftool_map_update_get_id() esac } +_bpftool_map_update_get_name() +{ + local command="$1" + + # Is it the map to update, or a map to insert into the map to update? + # Search for "value" keyword. + local idx value + for (( idx=7; idx < ${#words[@]}-1; idx++ )); do + if [[ ${words[idx]} == "value" ]]; then + value=1 + break + fi + done + if [[ $value -eq 0 ]]; then + case "$command" in + push) + _bpftool_get_map_names_for_type stack + ;; + enqueue) + _bpftool_get_map_names_for_type queue + ;; + *) + _bpftool_get_map_names + ;; + esac + return 0 + fi + + # Name to complete is for a value. It can be either prog name or map name. This + # depends on the type of the map to update. + local type=$(_bpftool_map_guess_map_type) + case $type in + array_of_maps|hash_of_maps) + _bpftool_get_map_names + return 0 + ;; + prog_array) + _bpftool_get_prog_names + return 0 + ;; + *) + return 0 + ;; + esac +} + _bpftool() { local cur prev words objword @@ -207,10 +268,6 @@ _bpftool() _bpftool_get_prog_tags return 0 ;; - name) - _bpftool_get_prog_names - return 0 - ;; dev) _sysfs_get_netdevs return 0 @@ -261,7 +318,8 @@ _bpftool() # Completion depends on object and command in use case $object in prog) - # Complete id, only for subcommands that use prog (but no map) ids + # Complete id and name, only for subcommands that use prog (but no + # map) ids/names. case $command in show|list|dump|pin) case $prev in @@ -269,12 +327,16 @@ _bpftool() _bpftool_get_prog_ids return 0 ;; + name) + _bpftool_get_prog_names + return 0 + ;; esac ;; esac local PROG_TYPE='id pinned tag name' - local MAP_TYPE='id pinned' + local MAP_TYPE='id pinned name' case $command in show|list) [[ $prev != "$command" ]] && return 0 @@ -325,6 +387,9 @@ _bpftool() id) _bpftool_get_prog_ids ;; + name) + _bpftool_get_map_names + ;; pinned) _filedir ;; @@ -345,6 +410,9 @@ _bpftool() id) _bpftool_get_map_ids ;; + name) + _bpftool_get_map_names + ;; pinned) _filedir ;; @@ -409,6 +477,10 @@ _bpftool() _bpftool_get_map_ids return 0 ;; + name) + _bpftool_get_map_names + return 0 + ;; pinned|pinmaps) _filedir return 0 @@ -457,7 +529,7 @@ _bpftool() esac ;; map) - local MAP_TYPE='id pinned' + local MAP_TYPE='id pinned name' case $command in show|list|dump|peek|pop|dequeue|freeze) case $prev in @@ -483,6 +555,24 @@ _bpftool() esac return 0 ;; + name) + case "$command" in + peek) + _bpftool_get_map_names_for_type stack + _bpftool_get_map_names_for_type queue + ;; + pop) + _bpftool_get_map_names_for_type stack + ;; + dequeue) + _bpftool_get_map_names_for_type queue + ;; + *) + _bpftool_get_map_names + ;; + esac + return 0 + ;; *) return 0 ;; @@ -530,6 +620,10 @@ _bpftool() _bpftool_get_map_ids return 0 ;; + name) + _bpftool_get_map_names + return 0 + ;; key) COMPREPLY+=( $( compgen -W 'hex' -- "$cur" ) ) ;; @@ -555,6 +649,10 @@ _bpftool() _bpftool_map_update_get_id $command return 0 ;; + name) + _bpftool_map_update_get_name $command + return 0 + ;; key) COMPREPLY+=( $( compgen -W 'hex' -- "$cur" ) ) ;; @@ -563,7 +661,7 @@ _bpftool() # map, depending on the type of the map to update. case "$(_bpftool_map_guess_map_type)" in array_of_maps|hash_of_maps) - local MAP_TYPE='id pinned' + local MAP_TYPE='id pinned name' COMPREPLY+=( $( compgen -W "$MAP_TYPE" \ -- "$cur" ) ) return 0 @@ -631,6 +729,10 @@ _bpftool() _bpftool_get_map_ids_for_type perf_event_array return 0 ;; + name) + _bpftool_get_map_names_for_type perf_event_array + return 0 + ;; cpu) return 0 ;; @@ -655,7 +757,7 @@ _bpftool() ;; btf) local PROG_TYPE='id pinned tag name' - local MAP_TYPE='id pinned' + local MAP_TYPE='id pinned name' case $command in dump) case $prev in @@ -686,6 +788,17 @@ _bpftool() esac return 0 ;; + name) + case $pprev in + prog) + _bpftool_get_prog_names + ;; + map) + _bpftool_get_map_names + ;; + esac + return 0 + ;; format) COMPREPLY=( $( compgen -W "c raw" -- "$cur" ) ) ;; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index a7ead7bb9447..81890f4c8cca 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -47,7 +47,7 @@ "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} |\n" \ "\t {-m|--mapcompat} | {-n|--nomount} }" #define HELP_SPEC_MAP \ - "MAP := { id MAP_ID | pinned FILE }" + "MAP := { id MAP_ID | pinned FILE | name MAP_NAME }" static const char * const prog_type_name[] = { [BPF_PROG_TYPE_UNSPEC] = "unspec", diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index de61d73b9030..c01f76fa6876 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -91,10 +91,66 @@ static void *alloc_value(struct bpf_map_info *info) return malloc(info->value_size); } -int map_parse_fd(int *argc, char ***argv) +static int map_fd_by_name(char *name, int **fds) { - int fd; + unsigned int id = 0; + int fd, nb_fds = 0; + void *tmp; + int err; + while (true) { + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + + err = bpf_map_get_next_id(id, &id); + if (err) { + if (errno != ENOENT) { + p_err("%s", strerror(errno)); + goto err_close_fds; + } + return nb_fds; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + p_err("can't get map by id (%u): %s", + id, strerror(errno)); + goto err_close_fds; + } + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get map info (%u): %s", + id, strerror(errno)); + goto err_close_fd; + } + + if (strncmp(name, info.name, BPF_OBJ_NAME_LEN)) { + close(fd); + continue; + } + + if (nb_fds > 0) { + tmp = realloc(*fds, (nb_fds + 1) * sizeof(int)); + if (!tmp) { + p_err("failed to realloc"); + goto err_close_fd; + } + *fds = tmp; + } + (*fds)[nb_fds++] = fd; + } + +err_close_fd: + close(fd); +err_close_fds: + while (--nb_fds >= 0) + close((*fds)[nb_fds]); + return -1; +} + +static int map_parse_fds(int *argc, char ***argv, int **fds) +{ if (is_prefix(**argv, "id")) { unsigned int id; char *endptr; @@ -108,10 +164,25 @@ int map_parse_fd(int *argc, char ***argv) } NEXT_ARGP(); - fd = bpf_map_get_fd_by_id(id); - if (fd < 0) + (*fds)[0] = bpf_map_get_fd_by_id(id); + if ((*fds)[0] < 0) { p_err("get map by id (%u): %s", id, strerror(errno)); - return fd; + return -1; + } + return 1; + } else if (is_prefix(**argv, "name")) { + char *name; + + NEXT_ARGP(); + + name = **argv; + if (strlen(name) > BPF_OBJ_NAME_LEN - 1) { + p_err("can't parse name"); + return -1; + } + NEXT_ARGP(); + + return map_fd_by_name(name, fds); } else if (is_prefix(**argv, "pinned")) { char *path; @@ -120,13 +191,43 @@ int map_parse_fd(int *argc, char ***argv) path = **argv; NEXT_ARGP(); - return open_obj_pinned_any(path, BPF_OBJ_MAP); + (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_MAP); + if ((*fds)[0] < 0) + return -1; + return 1; } - p_err("expected 'id' or 'pinned', got: '%s'?", **argv); + p_err("expected 'id', 'name' or 'pinned', got: '%s'?", **argv); return -1; } +int map_parse_fd(int *argc, char ***argv) +{ + int *fds = NULL; + int nb_fds, fd; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = map_parse_fds(argc, argv, &fds); + if (nb_fds != 1) { + if (nb_fds > 1) { + p_err("several maps match this handle"); + while (nb_fds--) + close(fds[nb_fds]); + } + fd = -1; + goto exit_free; + } + + fd = fds[0]; +exit_free: + free(fds); + return fd; +} + int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len) { int err; @@ -479,6 +580,21 @@ static int parse_elem(char **argv, struct bpf_map_info *info, return -1; } +static void show_map_header_json(struct bpf_map_info *info, json_writer_t *wtr) +{ + jsonw_uint_field(wtr, "id", info->id); + if (info->type < ARRAY_SIZE(map_type_name)) + jsonw_string_field(wtr, "type", map_type_name[info->type]); + else + jsonw_uint_field(wtr, "type", info->type); + + if (*info->name) + jsonw_string_field(wtr, "name", info->name); + + jsonw_name(wtr, "flags"); + jsonw_printf(wtr, "%d", info->map_flags); +} + static int show_map_close_json(int fd, struct bpf_map_info *info) { char *memlock, *frozen_str; @@ -489,18 +605,7 @@ static int show_map_close_json(int fd, struct bpf_map_info *info) jsonw_start_object(json_wtr); - jsonw_uint_field(json_wtr, "id", info->id); - if (info->type < ARRAY_SIZE(map_type_name)) - jsonw_string_field(json_wtr, "type", - map_type_name[info->type]); - else - jsonw_uint_field(json_wtr, "type", info->type); - - if (*info->name) - jsonw_string_field(json_wtr, "name", info->name); - - jsonw_name(json_wtr, "flags"); - jsonw_printf(json_wtr, "%d", info->map_flags); + show_map_header_json(info, json_wtr); print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); @@ -561,14 +666,8 @@ static int show_map_close_json(int fd, struct bpf_map_info *info) return 0; } -static int show_map_close_plain(int fd, struct bpf_map_info *info) +static void show_map_header_plain(struct bpf_map_info *info) { - char *memlock, *frozen_str; - int frozen = 0; - - memlock = get_fdinfo(fd, "memlock"); - frozen_str = get_fdinfo(fd, "frozen"); - printf("%u: ", info->id); if (info->type < ARRAY_SIZE(map_type_name)) printf("%s ", map_type_name[info->type]); @@ -581,6 +680,17 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info) printf("flags 0x%x", info->map_flags); print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino); printf("\n"); +} + +static int show_map_close_plain(int fd, struct bpf_map_info *info) +{ + char *memlock, *frozen_str; + int frozen = 0; + + memlock = get_fdinfo(fd, "memlock"); + frozen_str = get_fdinfo(fd, "frozen"); + + show_map_header_plain(info); printf("\tkey %uB value %uB max_entries %u", info->key_size, info->value_size, info->max_entries); @@ -642,6 +752,50 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info) return 0; } +static int do_show_subset(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int *fds = NULL; + int nb_fds, i; + int err = -1; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = map_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (json_output && nb_fds > 1) + jsonw_start_array(json_wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + err = bpf_obj_get_info_by_fd(fds[i], &info, &len); + if (err) { + p_err("can't get map info: %s", + strerror(errno)); + for (; i < nb_fds; i++) + close(fds[i]); + break; + } + + if (json_output) + show_map_close_json(fds[i], &info); + else + show_map_close_plain(fds[i], &info); + + close(fds[i]); + } + if (json_output && nb_fds > 1) + jsonw_end_array(json_wtr); /* root array */ + +exit_free: + free(fds); + return err; +} + static int do_show(int argc, char **argv) { struct bpf_map_info info = {}; @@ -653,16 +807,8 @@ static int do_show(int argc, char **argv) if (show_pinned) build_pinned_obj_table(&map_table, BPF_OBJ_MAP); - if (argc == 2) { - fd = map_parse_fd_and_info(&argc, &argv, &info, &len); - if (fd < 0) - return -1; - - if (json_output) - return show_map_close_json(fd, &info); - else - return show_map_close_plain(fd, &info); - } + if (argc == 2) + return do_show_subset(argc, argv); if (argc) return BAD_ARG(); @@ -765,26 +911,49 @@ static int dump_map_elem(int fd, void *key, void *value, return 0; } -static int do_dump(int argc, char **argv) +static int maps_have_btf(int *fds, int nb_fds) { struct bpf_map_info info = {}; + __u32 len = sizeof(info); + struct btf *btf = NULL; + int err, i; + + for (i = 0; i < nb_fds; i++) { + err = bpf_obj_get_info_by_fd(fds[i], &info, &len); + if (err) { + p_err("can't get map info: %s", strerror(errno)); + goto err_close; + } + + err = btf__get_from_id(info.btf_id, &btf); + if (err) { + p_err("failed to get btf"); + goto err_close; + } + + if (!btf) + return 0; + } + + return 1; + +err_close: + for (; i < nb_fds; i++) + close(fds[i]); + return -1; +} + +static int +map_dump(int fd, struct bpf_map_info *info, json_writer_t *wtr, + bool enable_btf, bool show_header) +{ void *key, *value, *prev_key; unsigned int num_elems = 0; - __u32 len = sizeof(info); - json_writer_t *btf_wtr; struct btf *btf = NULL; int err; - int fd; - if (argc != 2) - usage(); - - fd = map_parse_fd_and_info(&argc, &argv, &info, &len); - if (fd < 0) - return -1; - - key = malloc(info.key_size); - value = alloc_value(&info); + key = malloc(info->key_size); + value = alloc_value(info); if (!key || !value) { p_err("mem alloc failed"); err = -1; @@ -793,30 +962,32 @@ static int do_dump(int argc, char **argv) prev_key = NULL; - err = btf__get_from_id(info.btf_id, &btf); - if (err) { - p_err("failed to get btf"); - goto exit_free; + if (enable_btf) { + err = btf__get_from_id(info->btf_id, &btf); + if (err || !btf) { + /* enable_btf is true only if we've already checked + * that all maps have BTF information. + */ + p_err("failed to get btf"); + goto exit_free; + } } - if (json_output) - jsonw_start_array(json_wtr); - else - if (btf) { - btf_wtr = get_btf_writer(); - if (!btf_wtr) { - p_info("failed to create json writer for btf. falling back to plain output"); - btf__free(btf); - btf = NULL; - } else { - jsonw_start_array(btf_wtr); - } + if (wtr) { + if (show_header) { + jsonw_start_object(wtr); /* map object */ + show_map_header_json(info, wtr); + jsonw_name(wtr, "elements"); } + jsonw_start_array(wtr); /* elements */ + } else if (show_header) { + show_map_header_plain(info); + } - if (info.type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY && - info.value_size != 8) + if (info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY && + info->value_size != 8) p_info("Warning: cannot read values from %s map with value_size != 8", - map_type_name[info.type]); + map_type_name[info->type]); while (true) { err = bpf_map_get_next_key(fd, prev_key, key); if (err) { @@ -824,15 +995,14 @@ static int do_dump(int argc, char **argv) err = 0; break; } - num_elems += dump_map_elem(fd, key, value, &info, btf, btf_wtr); + num_elems += dump_map_elem(fd, key, value, info, btf, wtr); prev_key = key; } - if (json_output) - jsonw_end_array(json_wtr); - else if (btf) { - jsonw_end_array(btf_wtr); - jsonw_destroy(&btf_wtr); + if (wtr) { + jsonw_end_array(wtr); /* elements */ + if (show_header) + jsonw_end_object(wtr); /* map object */ } else { printf("Found %u element%s\n", num_elems, num_elems != 1 ? "s" : ""); @@ -847,6 +1017,72 @@ exit_free: return err; } +static int do_dump(int argc, char **argv) +{ + json_writer_t *wtr = NULL, *btf_wtr = NULL; + struct bpf_map_info info = {}; + int nb_fds, i = 0, btf = 0; + __u32 len = sizeof(info); + int *fds = NULL; + int err = -1; + + if (argc != 2) + usage(); + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = map_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (json_output) { + wtr = json_wtr; + } else { + btf = maps_have_btf(fds, nb_fds); + if (btf < 0) + goto exit_close; + if (btf) { + btf_wtr = get_btf_writer(); + if (btf_wtr) { + wtr = btf_wtr; + } else { + p_info("failed to create json writer for btf. falling back to plain output"); + btf = 0; + } + } + } + + if (wtr && nb_fds > 1) + jsonw_start_array(wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + if (bpf_obj_get_info_by_fd(fds[i], &info, &len)) { + p_err("can't get map info: %s", strerror(errno)); + break; + } + err = map_dump(fds[i], &info, wtr, btf, nb_fds > 1); + if (!wtr && i != nb_fds - 1) + printf("\n"); + + if (err) + break; + close(fds[i]); + } + if (wtr && nb_fds > 1) + jsonw_end_array(wtr); /* root array */ + + if (btf) + jsonw_destroy(&btf_wtr); +exit_close: + for (; i < nb_fds; i++) + close(fds[i]); +exit_free: + free(fds); + return err; +} + static int alloc_key_value(struct bpf_map_info *info, void **key, void **value) { *key = NULL; From aa915931ac3e53ccf371308e6750da510e3591dd Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 13 Dec 2019 07:11:14 -0300 Subject: [PATCH 032/127] libbpf: Fix readelf output parsing for Fedora Fedora binutils has been patched to show "other info" for a symbol at the end of the line. This was done in order to support unmaintained scripts that would break with the extra info. [1] [1] https://src.fedoraproject.org/rpms/binutils/c/b8265c46f7ddae23a792ee8306fbaaeacba83bf8 This in turn has been done to fix the build of ruby, because of checksec. [2] Thanks Michael Ellerman for the pointer. [2] https://bugzilla.redhat.com/show_bug.cgi?id=1479302 As libbpf Makefile is not unmaintained, we can simply deal with either output format, by just removing the "other info" field, as it always comes inside brackets. Fixes: 3464afdf11f9 (libbpf: Fix readelf output parsing on powerpc with recent binutils) Reported-by: Justin Forbes Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: Aurelien Jarno Link: https://lore.kernel.org/bpf/20191213101114.GA3986@calabresa --- tools/lib/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index defae23a0169..23ae06c43d08 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -147,6 +147,7 @@ TAGS_PROG := $(if $(shell which etags 2>/dev/null),etags,ctags) GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN_SHARED) | \ cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \ + sed 's/\[.*\]//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}' | \ sort -u | wc -l) VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \ @@ -213,6 +214,7 @@ check_abi: $(OUTPUT)libbpf.so "versioned in $(VERSION_SCRIPT)." >&2; \ readelf -s --wide $(BPF_IN_SHARED) | \ cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \ + sed 's/\[.*\]//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}'| \ sort -u > $(OUTPUT)libbpf_global_syms.tmp; \ readelf -s --wide $(OUTPUT)libbpf.so | \ From 0d13bfce023ac7cef4d0a50b83750254ce31c479 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:25 -0800 Subject: [PATCH 033/127] libbpf: Don't require root for bpf_object__open() Reorganize bpf_object__open and bpf_object__load steps such that bpf_object__open doesn't need root access. This was previously done for feature probing and BTF sanitization. This doesn't have to happen on open, though, so move all those steps into the load phase. This is important, because it makes it possible for tools like bpftool, to just open BPF object file and inspect their contents: programs, maps, BTF, etc. For such operations it is prohibitive to require root access. On the other hand, there is a lot of custom libbpf logic in those steps, so its best avoided for tools to reimplement all that on their own. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-2-andriin@fb.com --- tools/lib/bpf/libbpf.c | 83 +++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 27d5f7ecba32..dc993112b40b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -101,13 +101,6 @@ void libbpf_print(enum libbpf_print_level level, const char *format, ...) #define STRERR_BUFSIZE 128 -#define CHECK_ERR(action, err, out) do { \ - err = action; \ - if (err) \ - goto out; \ -} while (0) - - /* Copied from tools/perf/util/util.h */ #ifndef zfree # define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) @@ -864,8 +857,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, def->value_size = data->d_size; def->max_entries = 1; def->map_flags = type == LIBBPF_MAP_RODATA ? BPF_F_RDONLY_PROG : 0; - if (obj->caps.array_mmap) - def->map_flags |= BPF_F_MMAPABLE; + def->map_flags |= BPF_F_MMAPABLE; pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", map_name, map->sec_idx, map->sec_offset, def->map_flags); @@ -888,8 +880,6 @@ static int bpf_object__init_global_data_maps(struct bpf_object *obj) { int err; - if (!obj->caps.global_data) - return 0; /* * Populate obj->maps with libbpf internal maps. */ @@ -1393,10 +1383,11 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, return 0; } -static int bpf_object__init_maps(struct bpf_object *obj, bool relaxed_maps, - const char *pin_root_path) +static int bpf_object__init_maps(struct bpf_object *obj, + struct bpf_object_open_opts *opts) { - bool strict = !relaxed_maps; + const char *pin_root_path = OPTS_GET(opts, pin_root_path, NULL); + bool strict = !OPTS_GET(opts, relaxed_maps, false); int err; err = bpf_object__init_user_maps(obj, strict); @@ -1592,8 +1583,7 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) return 0; } -static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, - const char *pin_root_path) +static int bpf_object__elf_collect(struct bpf_object *obj) { Elf *elf = obj->efile.elf; GElf_Ehdr *ep = &obj->efile.ehdr; @@ -1728,14 +1718,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, pr_warn("Corrupted ELF file: index of strtab invalid\n"); return -LIBBPF_ERRNO__FORMAT; } - err = bpf_object__init_btf(obj, btf_data, btf_ext_data); - if (!err) - err = bpf_object__init_maps(obj, relaxed_maps, pin_root_path); - if (!err) - err = bpf_object__sanitize_and_load_btf(obj); - if (!err) - err = bpf_object__init_prog_names(obj); - return err; + return bpf_object__init_btf(obj, btf_data, btf_ext_data); } static struct bpf_program * @@ -1876,11 +1859,6 @@ static int bpf_program__record_reloc(struct bpf_program *prog, pr_warn("bad data relo against section %u\n", shdr_idx); return -LIBBPF_ERRNO__RELOC; } - if (!obj->caps.global_data) { - pr_warn("relocation: kernel does not support global \'%s\' variable access in insns[%d]\n", - name, insn_idx); - return -LIBBPF_ERRNO__RELOC; - } for (map_idx = 0; map_idx < nr_maps; map_idx++) { map = &obj->maps[map_idx]; if (map->libbpf_type != type) @@ -3918,12 +3896,10 @@ static struct bpf_object * __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, struct bpf_object_open_opts *opts) { - const char *pin_root_path; struct bpf_program *prog; struct bpf_object *obj; const char *obj_name; char tmp_name[64]; - bool relaxed_maps; __u32 attach_prog_fd; int err; @@ -3953,16 +3929,16 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, return obj; obj->relaxed_core_relocs = OPTS_GET(opts, relaxed_core_relocs, false); - relaxed_maps = OPTS_GET(opts, relaxed_maps, false); - pin_root_path = OPTS_GET(opts, pin_root_path, NULL); attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); - CHECK_ERR(bpf_object__elf_init(obj), err, out); - CHECK_ERR(bpf_object__check_endianness(obj), err, out); - CHECK_ERR(bpf_object__probe_caps(obj), err, out); - CHECK_ERR(bpf_object__elf_collect(obj, relaxed_maps, pin_root_path), - err, out); - CHECK_ERR(bpf_object__collect_reloc(obj), err, out); + err = bpf_object__elf_init(obj); + err = err ? : bpf_object__check_endianness(obj); + err = err ? : bpf_object__elf_collect(obj); + err = err ? : bpf_object__init_maps(obj, opts); + err = err ? : bpf_object__init_prog_names(obj); + err = err ? : bpf_object__collect_reloc(obj); + if (err) + goto out; bpf_object__elf_finish(obj); bpf_object__for_each_program(prog, obj) { @@ -4080,6 +4056,24 @@ int bpf_object__unload(struct bpf_object *obj) return 0; } +static int bpf_object__sanitize_maps(struct bpf_object *obj) +{ + struct bpf_map *m; + + bpf_object__for_each_map(m, obj) { + if (!bpf_map__is_internal(m)) + continue; + if (!obj->caps.global_data) { + pr_warn("kernel doesn't support global data\n"); + return -ENOTSUP; + } + if (!obj->caps.array_mmap) + m->def.map_flags ^= BPF_F_MMAPABLE; + } + + return 0; +} + int bpf_object__load_xattr(struct bpf_object_load_attr *attr) { struct bpf_object *obj; @@ -4098,9 +4092,14 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) obj->loaded = true; - CHECK_ERR(bpf_object__create_maps(obj), err, out); - CHECK_ERR(bpf_object__relocate(obj, attr->target_btf_path), err, out); - CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out); + err = bpf_object__probe_caps(obj); + err = err ? : bpf_object__sanitize_and_load_btf(obj); + err = err ? : bpf_object__sanitize_maps(obj); + err = err ? : bpf_object__create_maps(obj); + err = err ? : bpf_object__relocate(obj, attr->target_btf_path); + err = err ? : bpf_object__load_progs(obj, attr->log_level); + if (err) + goto out; return 0; out: From d7a18ea7e8b612669acd0131fd075e5c735c1ce5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:26 -0800 Subject: [PATCH 034/127] libbpf: Add generic bpf_program__attach() Generalize BPF program attaching and allow libbpf to auto-detect type (and extra parameters, where applicable) and attach supported BPF program types based on program sections. Currently this is supported for: - kprobe/kretprobe; - tracepoint; - raw tracepoint; - tracing programs (typed raw TP/fentry/fexit). More types support can be trivially added within this framework. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-3-andriin@fb.com --- tools/lib/bpf/libbpf.c | 181 ++++++++++++++---- tools/lib/bpf/libbpf.h | 2 + tools/lib/bpf/libbpf.map | 2 + .../selftests/bpf/prog_tests/probe_user.c | 6 +- 4 files changed, 153 insertions(+), 38 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index dc993112b40b..61b8cdf78332 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4972,7 +4972,28 @@ void bpf_program__set_expected_attach_type(struct bpf_program *prog, */ #define BPF_APROG_COMPAT(string, ptype) BPF_PROG_SEC(string, ptype) -static const struct { +#define SEC_DEF(sec_pfx, ptype, ...) { \ + .sec = sec_pfx, \ + .len = sizeof(sec_pfx) - 1, \ + .prog_type = BPF_PROG_TYPE_##ptype, \ + __VA_ARGS__ \ +} + +struct bpf_sec_def; + +typedef struct bpf_link *(*attach_fn_t)(const struct bpf_sec_def *sec, + struct bpf_program *prog); + +static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, + struct bpf_program *prog); +static struct bpf_link *attach_tp(const struct bpf_sec_def *sec, + struct bpf_program *prog); +static struct bpf_link *attach_raw_tp(const struct bpf_sec_def *sec, + struct bpf_program *prog); +static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, + struct bpf_program *prog); + +struct bpf_sec_def { const char *sec; size_t len; enum bpf_prog_type prog_type; @@ -4980,25 +5001,40 @@ static const struct { bool is_attachable; bool is_attach_btf; enum bpf_attach_type attach_type; -} section_names[] = { + attach_fn_t attach_fn; +}; + +static const struct bpf_sec_def section_defs[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), BPF_PROG_SEC("sk_reuseport", BPF_PROG_TYPE_SK_REUSEPORT), - BPF_PROG_SEC("kprobe/", BPF_PROG_TYPE_KPROBE), + SEC_DEF("kprobe/", KPROBE, + .attach_fn = attach_kprobe), BPF_PROG_SEC("uprobe/", BPF_PROG_TYPE_KPROBE), - BPF_PROG_SEC("kretprobe/", BPF_PROG_TYPE_KPROBE), + SEC_DEF("kretprobe/", KPROBE, + .attach_fn = attach_kprobe), BPF_PROG_SEC("uretprobe/", BPF_PROG_TYPE_KPROBE), BPF_PROG_SEC("classifier", BPF_PROG_TYPE_SCHED_CLS), BPF_PROG_SEC("action", BPF_PROG_TYPE_SCHED_ACT), - BPF_PROG_SEC("tracepoint/", BPF_PROG_TYPE_TRACEPOINT), - BPF_PROG_SEC("tp/", BPF_PROG_TYPE_TRACEPOINT), - BPF_PROG_SEC("raw_tracepoint/", BPF_PROG_TYPE_RAW_TRACEPOINT), - BPF_PROG_SEC("raw_tp/", BPF_PROG_TYPE_RAW_TRACEPOINT), - BPF_PROG_BTF("tp_btf/", BPF_PROG_TYPE_TRACING, - BPF_TRACE_RAW_TP), - BPF_PROG_BTF("fentry/", BPF_PROG_TYPE_TRACING, - BPF_TRACE_FENTRY), - BPF_PROG_BTF("fexit/", BPF_PROG_TYPE_TRACING, - BPF_TRACE_FEXIT), + SEC_DEF("tracepoint/", TRACEPOINT, + .attach_fn = attach_tp), + SEC_DEF("tp/", TRACEPOINT, + .attach_fn = attach_tp), + SEC_DEF("raw_tracepoint/", RAW_TRACEPOINT, + .attach_fn = attach_raw_tp), + SEC_DEF("raw_tp/", RAW_TRACEPOINT, + .attach_fn = attach_raw_tp), + SEC_DEF("tp_btf/", TRACING, + .expected_attach_type = BPF_TRACE_RAW_TP, + .is_attach_btf = true, + .attach_fn = attach_trace), + SEC_DEF("fentry/", TRACING, + .expected_attach_type = BPF_TRACE_FENTRY, + .is_attach_btf = true, + .attach_fn = attach_trace), + SEC_DEF("fexit/", TRACING, + .expected_attach_type = BPF_TRACE_FEXIT, + .is_attach_btf = true, + .attach_fn = attach_trace), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), @@ -5060,12 +5096,26 @@ static const struct { #undef BPF_APROG_SEC #undef BPF_EAPROG_SEC #undef BPF_APROG_COMPAT +#undef SEC_DEF #define MAX_TYPE_NAME_SIZE 32 +static const struct bpf_sec_def *find_sec_def(const char *sec_name) +{ + int i, n = ARRAY_SIZE(section_defs); + + for (i = 0; i < n; i++) { + if (strncmp(sec_name, + section_defs[i].sec, section_defs[i].len)) + continue; + return §ion_defs[i]; + } + return NULL; +} + static char *libbpf_get_type_names(bool attach_type) { - int i, len = ARRAY_SIZE(section_names) * MAX_TYPE_NAME_SIZE; + int i, len = ARRAY_SIZE(section_defs) * MAX_TYPE_NAME_SIZE; char *buf; buf = malloc(len); @@ -5074,16 +5124,16 @@ static char *libbpf_get_type_names(bool attach_type) buf[0] = '\0'; /* Forge string buf with all available names */ - for (i = 0; i < ARRAY_SIZE(section_names); i++) { - if (attach_type && !section_names[i].is_attachable) + for (i = 0; i < ARRAY_SIZE(section_defs); i++) { + if (attach_type && !section_defs[i].is_attachable) continue; - if (strlen(buf) + strlen(section_names[i].sec) + 2 > len) { + if (strlen(buf) + strlen(section_defs[i].sec) + 2 > len) { free(buf); return NULL; } strcat(buf, " "); - strcat(buf, section_names[i].sec); + strcat(buf, section_defs[i].sec); } return buf; @@ -5092,19 +5142,19 @@ static char *libbpf_get_type_names(bool attach_type) int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, enum bpf_attach_type *expected_attach_type) { + const struct bpf_sec_def *sec_def; char *type_names; - int i; if (!name) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(section_names); i++) { - if (strncmp(name, section_names[i].sec, section_names[i].len)) - continue; - *prog_type = section_names[i].prog_type; - *expected_attach_type = section_names[i].expected_attach_type; + sec_def = find_sec_def(name); + if (sec_def) { + *prog_type = sec_def->prog_type; + *expected_attach_type = sec_def->expected_attach_type; return 0; } + pr_warn("failed to guess program type from ELF section '%s'\n", name); type_names = libbpf_get_type_names(false); if (type_names != NULL) { @@ -5187,16 +5237,16 @@ static int libbpf_find_attach_btf_id(const char *name, if (!name) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(section_names); i++) { - if (!section_names[i].is_attach_btf) + for (i = 0; i < ARRAY_SIZE(section_defs); i++) { + if (!section_defs[i].is_attach_btf) continue; - if (strncmp(name, section_names[i].sec, section_names[i].len)) + if (strncmp(name, section_defs[i].sec, section_defs[i].len)) continue; if (attach_prog_fd) - err = libbpf_find_prog_btf_id(name + section_names[i].len, + err = libbpf_find_prog_btf_id(name + section_defs[i].len, attach_prog_fd); else - err = libbpf_find_vmlinux_btf_id(name + section_names[i].len, + err = libbpf_find_vmlinux_btf_id(name + section_defs[i].len, attach_type); if (err <= 0) pr_warn("%s is not found in vmlinux BTF\n", name); @@ -5215,12 +5265,12 @@ int libbpf_attach_type_by_name(const char *name, if (!name) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(section_names); i++) { - if (strncmp(name, section_names[i].sec, section_names[i].len)) + for (i = 0; i < ARRAY_SIZE(section_defs); i++) { + if (strncmp(name, section_defs[i].sec, section_defs[i].len)) continue; - if (!section_names[i].is_attachable) + if (!section_defs[i].is_attachable) return -EINVAL; - *attach_type = section_names[i].attach_type; + *attach_type = section_defs[i].attach_type; return 0; } pr_warn("failed to guess attach type based on ELF section name '%s'\n", name); @@ -5680,6 +5730,18 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, return link; } +static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + const char *func_name; + bool retprobe; + + func_name = bpf_program__title(prog, false) + sec->len; + retprobe = strcmp(sec->sec, "kretprobe/") == 0; + + return bpf_program__attach_kprobe(prog, retprobe, func_name); +} + struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, bool retprobe, pid_t pid, const char *binary_path, @@ -5792,6 +5854,32 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog, return link; } +static struct bpf_link *attach_tp(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + char *sec_name, *tp_cat, *tp_name; + struct bpf_link *link; + + sec_name = strdup(bpf_program__title(prog, false)); + if (!sec_name) + return ERR_PTR(-ENOMEM); + + /* extract "tp//" */ + tp_cat = sec_name + sec->len; + tp_name = strchr(tp_cat, '/'); + if (!tp_name) { + link = ERR_PTR(-EINVAL); + goto out; + } + *tp_name = '\0'; + tp_name++; + + link = bpf_program__attach_tracepoint(prog, tp_cat, tp_name); +out: + free(sec_name); + return link; +} + static int bpf_link__destroy_fd(struct bpf_link *link) { struct bpf_link_fd *l = (void *)link; @@ -5831,6 +5919,14 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, return (struct bpf_link *)link; } +static struct bpf_link *attach_raw_tp(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + const char *tp_name = bpf_program__title(prog, false) + sec->len; + + return bpf_program__attach_raw_tracepoint(prog, tp_name); +} + struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) { char errmsg[STRERR_BUFSIZE]; @@ -5862,6 +5958,23 @@ struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) return (struct bpf_link *)link; } +static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + return bpf_program__attach_trace(prog); +} + +struct bpf_link *bpf_program__attach(struct bpf_program *prog) +{ + const struct bpf_sec_def *sec_def; + + sec_def = find_sec_def(bpf_program__title(prog, false)); + if (!sec_def || !sec_def->attach_fn) + return ERR_PTR(-ESRCH); + + return sec_def->attach_fn(sec_def, prog); +} + enum bpf_perf_event_ret bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 0dbf4bfba0c4..804f445c9957 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -237,6 +237,8 @@ struct bpf_link; LIBBPF_API int bpf_link__destroy(struct bpf_link *link); +LIBBPF_API struct bpf_link * +bpf_program__attach(struct bpf_program *prog); LIBBPF_API struct bpf_link * bpf_program__attach_perf_event(struct bpf_program *prog, int pfd); LIBBPF_API struct bpf_link * diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 495df575f87f..757a88f64b5a 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -210,4 +210,6 @@ LIBBPF_0.0.6 { } LIBBPF_0.0.5; LIBBPF_0.0.7 { + global: + bpf_program__attach; } LIBBPF_0.0.6; diff --git a/tools/testing/selftests/bpf/prog_tests/probe_user.c b/tools/testing/selftests/bpf/prog_tests/probe_user.c index 8a3187dec048..7aecfd9e87d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/probe_user.c +++ b/tools/testing/selftests/bpf/prog_tests/probe_user.c @@ -3,8 +3,7 @@ void test_probe_user(void) { -#define kprobe_name "__sys_connect" - const char *prog_name = "kprobe/" kprobe_name; + const char *prog_name = "kprobe/__sys_connect"; const char *obj_file = "./test_probe_user.o"; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, ); int err, results_map_fd, sock_fd, duration = 0; @@ -33,8 +32,7 @@ void test_probe_user(void) "err %d\n", results_map_fd)) goto cleanup; - kprobe_link = bpf_program__attach_kprobe(kprobe_prog, false, - kprobe_name); + kprobe_link = bpf_program__attach(kprobe_prog); if (CHECK(IS_ERR(kprobe_link), "attach_kprobe", "err %ld\n", PTR_ERR(kprobe_link))) { kprobe_link = NULL; From 612d05be250aa8804d3baba7a12445a267a580d3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:27 -0800 Subject: [PATCH 035/127] libbpf: Move non-public APIs from libbpf.h to libbpf_internal.h Few libbpf APIs are not public but currently exposed through libbpf.h to be used by bpftool. Move them to libbpf_internal.h, where intent of being non-stable and non-public is much more obvious. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-4-andriin@fb.com --- tools/bpf/bpftool/net.c | 1 + tools/lib/bpf/libbpf.h | 17 ----------------- tools/lib/bpf/libbpf_internal.h | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 4f52d3151616..d93bee298e54 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -18,6 +18,7 @@ #include #include +#include "libbpf_internal.h" #include "main.h" #include "netlink_dumper.h" diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 804f445c9957..2698fbcb0c79 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -126,11 +126,6 @@ bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz, LIBBPF_API struct bpf_object * bpf_object__open_xattr(struct bpf_object_open_attr *attr); -int bpf_object__section_size(const struct bpf_object *obj, const char *name, - __u32 *size); -int bpf_object__variable_offset(const struct bpf_object *obj, const char *name, - __u32 *off); - enum libbpf_pin_type { LIBBPF_PIN_NONE, /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ @@ -514,18 +509,6 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, bpf_perf_event_print_t fn, void *private_data); -struct nlattr; -typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); -int libbpf_netlink_open(unsigned int *nl_pid); -int libbpf_nl_get_link(int sock, unsigned int nl_pid, - libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie); -int libbpf_nl_get_class(int sock, unsigned int nl_pid, int ifindex, - libbpf_dump_nlmsg_t dump_class_nlmsg, void *cookie); -int libbpf_nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex, - libbpf_dump_nlmsg_t dump_qdisc_nlmsg, void *cookie); -int libbpf_nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, - libbpf_dump_nlmsg_t dump_filter_nlmsg, void *cookie); - struct bpf_prog_linfo; struct bpf_prog_info; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index f4f10715db40..7d71621bb7e8 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -100,6 +100,23 @@ int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, const char *str_sec, size_t str_len); +int bpf_object__section_size(const struct bpf_object *obj, const char *name, + __u32 *size); +int bpf_object__variable_offset(const struct bpf_object *obj, const char *name, + __u32 *off); + +struct nlattr; +typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); +int libbpf_netlink_open(unsigned int *nl_pid); +int libbpf_nl_get_link(int sock, unsigned int nl_pid, + libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie); +int libbpf_nl_get_class(int sock, unsigned int nl_pid, int ifindex, + libbpf_dump_nlmsg_t dump_class_nlmsg, void *cookie); +int libbpf_nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex, + libbpf_dump_nlmsg_t dump_qdisc_nlmsg, void *cookie); +int libbpf_nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, + libbpf_dump_nlmsg_t dump_filter_nlmsg, void *cookie); + struct btf_ext_info { /* * info points to the individual info section (e.g. func_info and From 917f6b7b07a46e53fa73e112d23c97d1b201a877 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:28 -0800 Subject: [PATCH 036/127] libbpf: Add BPF_EMBED_OBJ macro for embedding BPF .o files Add a convenience macro BPF_EMBED_OBJ, which allows to embed other files (typically used to embed BPF .o files) into a hosting userspace programs. To C program it is exposed as struct bpf_embed_data, containing a pointer to raw data and its size in bytes. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-5-andriin@fb.com --- tools/lib/bpf/libbpf.h | 35 +++++++++++++++++++ .../selftests/bpf/prog_tests/attach_probe.c | 23 ++---------- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 2698fbcb0c79..fa803dde1f46 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -615,6 +615,41 @@ bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear); */ LIBBPF_API int libbpf_num_possible_cpus(void); +struct bpf_embed_data { + void *data; + size_t size; +}; + +#define BPF_EMBED_OBJ_DECLARE(NAME) \ +extern struct bpf_embed_data NAME##_embed; \ +extern char NAME##_data[]; \ +extern char NAME##_data_end[]; + +#define __BPF_EMBED_OBJ(NAME, PATH, SZ, ASM_TYPE) \ +asm ( \ +" .pushsection \".rodata\", \"a\", @progbits \n" \ +" .global "#NAME"_data \n" \ +#NAME"_data: \n" \ +" .incbin \"" PATH "\" \n" \ +" .global "#NAME"_data_end \n" \ +#NAME"_data_end: \n" \ +" .global "#NAME"_embed \n" \ +" .type "#NAME"_embed, @object \n" \ +" .size "#NAME"_size, "#SZ" \n" \ +" .align 8, \n" \ +#NAME"_embed: \n" \ +" "ASM_TYPE" "#NAME"_data \n" \ +" "ASM_TYPE" "#NAME"_data_end - "#NAME"_data \n" \ +" .popsection \n" \ +); \ +BPF_EMBED_OBJ_DECLARE(NAME) + +#if __SIZEOF_POINTER__ == 4 +#define BPF_EMBED_OBJ(NAME, PATH) __BPF_EMBED_OBJ(NAME, PATH, 8, ".long") +#else +#define BPF_EMBED_OBJ(NAME, PATH) __BPF_EMBED_OBJ(NAME, PATH, 16, ".quad") +#endif + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index a83111a32d4a..b2e7c1424b07 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -1,24 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include -#define EMBED_FILE(NAME, PATH) \ -asm ( \ -" .pushsection \".rodata\", \"a\", @progbits \n" \ -" .global "#NAME"_data \n" \ -#NAME"_data: \n" \ -" .incbin \"" PATH "\" \n" \ -#NAME"_data_end: \n" \ -" .global "#NAME"_size \n" \ -" .type "#NAME"_size, @object \n" \ -" .size "#NAME"_size, 4 \n" \ -" .align 4, \n" \ -#NAME"_size: \n" \ -" .int "#NAME"_data_end - "#NAME"_data \n" \ -" .popsection \n" \ -); \ -extern char NAME##_data[]; \ -extern int NAME##_size; - ssize_t get_base_addr() { size_t start; char buf[256]; @@ -39,7 +21,7 @@ ssize_t get_base_addr() { return -EINVAL; } -EMBED_FILE(probe, "test_attach_probe.o"); +BPF_EMBED_OBJ(probe, "test_attach_probe.o"); void test_attach_probe(void) { @@ -73,7 +55,8 @@ void test_attach_probe(void) uprobe_offset = (size_t)&get_base_addr - base_addr; /* open object */ - obj = bpf_object__open_mem(probe_data, probe_size, &open_opts); + obj = bpf_object__open_mem(probe_embed.data, probe_embed.size, + &open_opts); if (CHECK(IS_ERR(obj), "obj_open_mem", "err %ld\n", PTR_ERR(obj))) return; From 544402d4b49332a4a9b2b8fff20f9d9f5ef86559 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:29 -0800 Subject: [PATCH 037/127] libbpf: Extract common user-facing helpers LIBBPF_API and DECLARE_LIBBPF_OPTS are needed in many public libbpf API headers. Extract them into libbpf_common.h to avoid unnecessary interdependency between btf.h, libbpf.h, and bpf.h or code duplication. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191214014341.3442258-6-andriin@fb.com --- tools/lib/bpf/bpf.h | 6 ++---- tools/lib/bpf/btf.h | 6 ++---- tools/lib/bpf/libbpf.h | 28 ++------------------------ tools/lib/bpf/libbpf_common.h | 38 +++++++++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 34 deletions(-) create mode 100644 tools/lib/bpf/libbpf_common.h diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 3c791fa8e68e..269807ce9ef5 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -28,14 +28,12 @@ #include #include +#include "libbpf_common.h" + #ifdef __cplusplus extern "C" { #endif -#ifndef LIBBPF_API -#define LIBBPF_API __attribute__((visibility("default"))) -#endif - struct bpf_create_map_attr { const char *name; enum bpf_map_type map_type; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index d9ac73a02cde..5fc23b988deb 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -8,14 +8,12 @@ #include #include +#include "libbpf_common.h" + #ifdef __cplusplus extern "C" { #endif -#ifndef LIBBPF_API -#define LIBBPF_API __attribute__((visibility("default"))) -#endif - #define BTF_ELF_SEC ".BTF" #define BTF_EXT_ELF_SEC ".BTF.ext" #define MAPS_ELF_SEC ".maps" diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index fa803dde1f46..49e6fa01024b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -17,14 +17,12 @@ #include // for size_t #include +#include "libbpf_common.h" + #ifdef __cplusplus extern "C" { #endif -#ifndef LIBBPF_API -#define LIBBPF_API __attribute__((visibility("default"))) -#endif - enum libbpf_errno { __LIBBPF_ERRNO__START = 4000, @@ -67,28 +65,6 @@ struct bpf_object_open_attr { enum bpf_prog_type prog_type; }; -/* Helper macro to declare and initialize libbpf options struct - * - * This dance with uninitialized declaration, followed by memset to zero, - * followed by assignment using compound literal syntax is done to preserve - * ability to use a nice struct field initialization syntax and **hopefully** - * have all the padding bytes initialized to zero. It's not guaranteed though, - * when copying literal, that compiler won't copy garbage in literal's padding - * bytes, but that's the best way I've found and it seems to work in practice. - * - * Macro declares opts struct of given type and name, zero-initializes, - * including any extra padding, it with memset() and then assigns initial - * values provided by users in struct initializer-syntax as varargs. - */ -#define DECLARE_LIBBPF_OPTS(TYPE, NAME, ...) \ - struct TYPE NAME = ({ \ - memset(&NAME, 0, sizeof(struct TYPE)); \ - (struct TYPE) { \ - .sz = sizeof(struct TYPE), \ - __VA_ARGS__ \ - }; \ - }) - struct bpf_object_open_opts { /* size of this struct, for forward/backward compatiblity */ size_t sz; diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h new file mode 100644 index 000000000000..4fb833840961 --- /dev/null +++ b/tools/lib/bpf/libbpf_common.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Common user-facing libbpf helpers. + * + * Copyright (c) 2019 Facebook + */ + +#ifndef __LIBBPF_LIBBPF_COMMON_H +#define __LIBBPF_LIBBPF_COMMON_H + +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + +/* Helper macro to declare and initialize libbpf options struct + * + * This dance with uninitialized declaration, followed by memset to zero, + * followed by assignment using compound literal syntax is done to preserve + * ability to use a nice struct field initialization syntax and **hopefully** + * have all the padding bytes initialized to zero. It's not guaranteed though, + * when copying literal, that compiler won't copy garbage in literal's padding + * bytes, but that's the best way I've found and it seems to work in practice. + * + * Macro declares opts struct of given type and name, zero-initializes, + * including any extra padding, it with memset() and then assigns initial + * values provided by users in struct initializer-syntax as varargs. + */ +#define DECLARE_LIBBPF_OPTS(TYPE, NAME, ...) \ + struct TYPE NAME = ({ \ + memset(&NAME, 0, sizeof(struct TYPE)); \ + (struct TYPE) { \ + .sz = sizeof(struct TYPE), \ + __VA_ARGS__ \ + }; \ + }) + +#endif /* __LIBBPF_LIBBPF_COMMON_H */ From 3d208f4ca111a614903f49d5a77b93ddc6de294e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:30 -0800 Subject: [PATCH 038/127] libbpf: Expose btf__align_of() API Expose BTF API that calculates type alignment requirements. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191214014341.3442258-7-andriin@fb.com --- tools/lib/bpf/btf.c | 39 +++++++++++++++++++++++++++++++++ tools/lib/bpf/btf.h | 1 + tools/lib/bpf/btf_dump.c | 47 +++++----------------------------------- tools/lib/bpf/libbpf.map | 1 + 4 files changed, 47 insertions(+), 41 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 88efa2bb7137..84fe82f27bef 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -278,6 +278,45 @@ done: return nelems * size; } +int btf__align_of(const struct btf *btf, __u32 id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + __u16 kind = btf_kind(t); + + switch (kind) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + return min(sizeof(void *), t->size); + case BTF_KIND_PTR: + return sizeof(void *); + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + return btf__align_of(btf, t->type); + case BTF_KIND_ARRAY: + return btf__align_of(btf, btf_array(t)->type); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + __u16 vlen = btf_vlen(t); + int i, align = 1, t; + + for (i = 0; i < vlen; i++, m++) { + t = btf__align_of(btf, m->type); + if (t <= 0) + return t; + align = max(align, t); + } + + return align; + } + default: + pr_warn("unsupported BTF_KIND:%u\n", btf_kind(t)); + return 0; + } +} + int btf__resolve_type(const struct btf *btf, __u32 type_id) { const struct btf_type *t; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 5fc23b988deb..a114c8ef4f08 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -77,6 +77,7 @@ LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id); LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__align_of(const struct btf *btf, __u32 id); LIBBPF_API int btf__fd(const struct btf *btf); LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size); LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index cb126d8fcf75..53393026d085 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -752,41 +752,6 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) } } -static int btf_align_of(const struct btf *btf, __u32 id) -{ - const struct btf_type *t = btf__type_by_id(btf, id); - __u16 kind = btf_kind(t); - - switch (kind) { - case BTF_KIND_INT: - case BTF_KIND_ENUM: - return min(sizeof(void *), t->size); - case BTF_KIND_PTR: - return sizeof(void *); - case BTF_KIND_TYPEDEF: - case BTF_KIND_VOLATILE: - case BTF_KIND_CONST: - case BTF_KIND_RESTRICT: - return btf_align_of(btf, t->type); - case BTF_KIND_ARRAY: - return btf_align_of(btf, btf_array(t)->type); - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: { - const struct btf_member *m = btf_members(t); - __u16 vlen = btf_vlen(t); - int i, align = 1; - - for (i = 0; i < vlen; i++, m++) - align = max(align, btf_align_of(btf, m->type)); - - return align; - } - default: - pr_warn("unsupported BTF_KIND:%u\n", btf_kind(t)); - return 1; - } -} - static bool btf_is_struct_packed(const struct btf *btf, __u32 id, const struct btf_type *t) { @@ -794,18 +759,18 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id, int align, i, bit_sz; __u16 vlen; - align = btf_align_of(btf, id); + align = btf__align_of(btf, id); /* size of a non-packed struct has to be a multiple of its alignment*/ - if (t->size % align) + if (align && t->size % align) return true; m = btf_members(t); vlen = btf_vlen(t); /* all non-bitfield fields have to be naturally aligned */ for (i = 0; i < vlen; i++, m++) { - align = btf_align_of(btf, m->type); + align = btf__align_of(btf, m->type); bit_sz = btf_member_bitfield_size(t, i); - if (bit_sz == 0 && m->offset % (8 * align) != 0) + if (align && bit_sz == 0 && m->offset % (8 * align) != 0) return true; } @@ -889,7 +854,7 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, fname = btf_name_of(d, m->name_off); m_sz = btf_member_bitfield_size(t, i); m_off = btf_member_bit_offset(t, i); - align = packed ? 1 : btf_align_of(d->btf, m->type); + align = packed ? 1 : btf__align_of(d->btf, m->type); btf_dump_emit_bit_padding(d, off, m_off, m_sz, align, lvl + 1); btf_dump_printf(d, "\n%s", pfx(lvl + 1)); @@ -907,7 +872,7 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, /* pad at the end, if necessary */ if (is_struct) { - align = packed ? 1 : btf_align_of(d->btf, id); + align = packed ? 1 : btf__align_of(d->btf, id); btf_dump_emit_bit_padding(d, off, t->size * 8, 0, align, lvl + 1); } diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 757a88f64b5a..e7fcca36f186 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -212,4 +212,5 @@ LIBBPF_0.0.6 { LIBBPF_0.0.7 { global: bpf_program__attach; + btf__align_of; } LIBBPF_0.0.6; From 9f81654eebe8de7e0db15534816d8f6c84b2e1e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:31 -0800 Subject: [PATCH 039/127] libbpf: Expose BTF-to-C type declaration emitting API Expose API that allows to emit type declaration and field/variable definition (if optional field name is specified) in valid C syntax for any provided BTF type. This is going to be used by bpftool when emitting data section layout as a struct. As part of making this API useful in a stand-alone fashion, move initialization of some of the internal btf_dump state to earlier phase. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-8-andriin@fb.com --- tools/lib/bpf/btf.h | 22 +++++++++++++ tools/lib/bpf/btf_dump.c | 68 +++++++++++++++++++++++++--------------- tools/lib/bpf/libbpf.map | 1 + 3 files changed, 65 insertions(+), 26 deletions(-) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index a114c8ef4f08..8d73f7f5551f 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -126,6 +126,28 @@ LIBBPF_API void btf_dump__free(struct btf_dump *d); LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); +struct btf_dump_emit_type_decl_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* optional field name for type declaration, e.g.: + * - struct my_struct + * - void (*)(int) + * - char (*)[123] + */ + const char *field_name; + /* extra indentation level (in number of tabs) to emit for multi-line + * type declarations (e.g., anonymous struct); applies for lines + * starting from the second one (first line is assumed to have + * necessary indentation already + */ + int indent_level; +}; +#define btf_dump_emit_type_decl_opts__last_field indent_level + +LIBBPF_API int +btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, + const struct btf_dump_emit_type_decl_opts *opts); + /* * A set of helpers for easier BTF types handling */ diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 53393026d085..e95f7710f210 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -116,6 +116,8 @@ static void btf_dump_printf(const struct btf_dump *d, const char *fmt, ...) va_end(args); } +static int btf_dump_mark_referenced(struct btf_dump *d); + struct btf_dump *btf_dump__new(const struct btf *btf, const struct btf_ext *btf_ext, const struct btf_dump_opts *opts, @@ -137,18 +139,39 @@ struct btf_dump *btf_dump__new(const struct btf *btf, if (IS_ERR(d->type_names)) { err = PTR_ERR(d->type_names); d->type_names = NULL; - btf_dump__free(d); - return ERR_PTR(err); } d->ident_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); if (IS_ERR(d->ident_names)) { err = PTR_ERR(d->ident_names); d->ident_names = NULL; - btf_dump__free(d); - return ERR_PTR(err); + goto err; + } + d->type_states = calloc(1 + btf__get_nr_types(d->btf), + sizeof(d->type_states[0])); + if (!d->type_states) { + err = -ENOMEM; + goto err; + } + d->cached_names = calloc(1 + btf__get_nr_types(d->btf), + sizeof(d->cached_names[0])); + if (!d->cached_names) { + err = -ENOMEM; + goto err; } + /* VOID is special */ + d->type_states[0].order_state = ORDERED; + d->type_states[0].emit_state = EMITTED; + + /* eagerly determine referenced types for anon enums */ + err = btf_dump_mark_referenced(d); + if (err) + goto err; + return d; +err: + btf_dump__free(d); + return ERR_PTR(err); } void btf_dump__free(struct btf_dump *d) @@ -175,7 +198,6 @@ void btf_dump__free(struct btf_dump *d) free(d); } -static int btf_dump_mark_referenced(struct btf_dump *d); static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr); static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id); @@ -202,27 +224,6 @@ int btf_dump__dump_type(struct btf_dump *d, __u32 id) if (id > btf__get_nr_types(d->btf)) return -EINVAL; - /* type states are lazily allocated, as they might not be needed */ - if (!d->type_states) { - d->type_states = calloc(1 + btf__get_nr_types(d->btf), - sizeof(d->type_states[0])); - if (!d->type_states) - return -ENOMEM; - d->cached_names = calloc(1 + btf__get_nr_types(d->btf), - sizeof(d->cached_names[0])); - if (!d->cached_names) - return -ENOMEM; - - /* VOID is special */ - d->type_states[0].order_state = ORDERED; - d->type_states[0].emit_state = EMITTED; - - /* eagerly determine referenced types for anon enums */ - err = btf_dump_mark_referenced(d); - if (err) - return err; - } - d->emit_queue_cnt = 0; err = btf_dump_order_type(d, id, false); if (err < 0) @@ -1016,6 +1017,21 @@ static int btf_dump_push_decl_stack_id(struct btf_dump *d, __u32 id) * of a stack frame. Some care is required to "pop" stack frames after * processing type declaration chain. */ +int btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, + const struct btf_dump_emit_type_decl_opts *opts) +{ + const char *fname; + int lvl; + + if (!OPTS_VALID(opts, btf_dump_emit_type_decl_opts)) + return -EINVAL; + + fname = OPTS_GET(opts, field_name, NULL); + lvl = OPTS_GET(opts, indent_level, 0); + btf_dump_emit_type_decl(d, id, fname, lvl); + return 0; +} + static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, const char *fname, int lvl) { diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index e7fcca36f186..990c7c0e2d9f 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -211,6 +211,7 @@ LIBBPF_0.0.6 { LIBBPF_0.0.7 { global: + btf_dump__emit_type_decl; bpf_program__attach; btf__align_of; } LIBBPF_0.0.6; From 01af3bf06755dc5cda7050fe5d898998e5775e63 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:32 -0800 Subject: [PATCH 040/127] libbpf: Expose BPF program's function name Add APIs to get BPF program function name, as opposed to bpf_program__title(), which returns BPF program function's section name. Function name has a benefit of being a valid C identifier and uniquely identifies a specific BPF program, while section name can be duplicated across multiple independent BPF programs. Add also bpf_object__find_program_by_name(), similar to bpf_object__find_program_by_title(), to facilitate looking up BPF programs by their C function names. Convert one of selftests to new API for look up. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-9-andriin@fb.com --- tools/lib/bpf/libbpf.c | 28 +++++++++++++++---- tools/lib/bpf/libbpf.h | 9 ++++-- tools/lib/bpf/libbpf.map | 2 ++ .../selftests/bpf/prog_tests/rdonly_maps.c | 11 +++----- 4 files changed, 36 insertions(+), 14 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 61b8cdf78332..e7a6b57d849c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -209,8 +209,8 @@ static const char * const libbpf_type_to_btf_name[] = { }; struct bpf_map { - int fd; char *name; + int fd; int sec_idx; size_t sec_offset; int map_ifindex; @@ -1384,7 +1384,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, } static int bpf_object__init_maps(struct bpf_object *obj, - struct bpf_object_open_opts *opts) + const struct bpf_object_open_opts *opts) { const char *pin_root_path = OPTS_GET(opts, pin_root_path, NULL); bool strict = !OPTS_GET(opts, relaxed_maps, false); @@ -1748,6 +1748,19 @@ bpf_object__find_program_by_title(const struct bpf_object *obj, return NULL; } +struct bpf_program * +bpf_object__find_program_by_name(const struct bpf_object *obj, + const char *name) +{ + struct bpf_program *prog; + + bpf_object__for_each_program(prog, obj) { + if (!strcmp(prog->name, name)) + return prog; + } + return NULL; +} + static bool bpf_object__shndx_is_data(const struct bpf_object *obj, int shndx) { @@ -3894,7 +3907,7 @@ static int libbpf_find_attach_btf_id(const char *name, __u32 attach_prog_fd); static struct bpf_object * __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, - struct bpf_object_open_opts *opts) + const struct bpf_object_open_opts *opts) { struct bpf_program *prog; struct bpf_object *obj; @@ -4003,7 +4016,7 @@ struct bpf_object *bpf_object__open(const char *path) } struct bpf_object * -bpf_object__open_file(const char *path, struct bpf_object_open_opts *opts) +bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) { if (!path) return ERR_PTR(-EINVAL); @@ -4015,7 +4028,7 @@ bpf_object__open_file(const char *path, struct bpf_object_open_opts *opts) struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, - struct bpf_object_open_opts *opts) + const struct bpf_object_open_opts *opts) { if (!obj_buf || obj_buf_sz == 0) return ERR_PTR(-EINVAL); @@ -4820,6 +4833,11 @@ void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex) prog->prog_ifindex = ifindex; } +const char *bpf_program__name(const struct bpf_program *prog) +{ + return prog->name; +} + const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy) { const char *title; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 49e6fa01024b..f37bd4a3e14b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -90,10 +90,10 @@ struct bpf_object_open_opts { LIBBPF_API struct bpf_object *bpf_object__open(const char *path); LIBBPF_API struct bpf_object * -bpf_object__open_file(const char *path, struct bpf_object_open_opts *opts); +bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts); LIBBPF_API struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, - struct bpf_object_open_opts *opts); + const struct bpf_object_open_opts *opts); /* deprecated bpf_object__open variants */ LIBBPF_API struct bpf_object * @@ -132,6 +132,7 @@ struct bpf_object_load_attr { LIBBPF_API int bpf_object__load(struct bpf_object *obj); LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr); LIBBPF_API int bpf_object__unload(struct bpf_object *obj); + LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); @@ -142,6 +143,9 @@ LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); LIBBPF_API struct bpf_program * bpf_object__find_program_by_title(const struct bpf_object *obj, const char *title); +LIBBPF_API struct bpf_program * +bpf_object__find_program_by_name(const struct bpf_object *obj, + const char *name); LIBBPF_API struct bpf_object *bpf_object__next(struct bpf_object *prev); #define bpf_object__for_each_safe(pos, tmp) \ @@ -185,6 +189,7 @@ LIBBPF_API void *bpf_program__priv(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex); +LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); LIBBPF_API const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 990c7c0e2d9f..5a7630748eeb 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -212,6 +212,8 @@ LIBBPF_0.0.6 { LIBBPF_0.0.7 { global: btf_dump__emit_type_decl; + bpf_object__find_program_by_name; bpf_program__attach; + bpf_program__name; btf__align_of; } LIBBPF_0.0.6; diff --git a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c index d90acc13d1ec..563e12120e77 100644 --- a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c +++ b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c @@ -16,14 +16,11 @@ struct rdonly_map_subtest { void test_rdonly_maps(void) { - const char *prog_name_skip_loop = "raw_tracepoint/sys_enter:skip_loop"; - const char *prog_name_part_loop = "raw_tracepoint/sys_enter:part_loop"; - const char *prog_name_full_loop = "raw_tracepoint/sys_enter:full_loop"; const char *file = "test_rdonly_maps.o"; struct rdonly_map_subtest subtests[] = { - { "skip loop", prog_name_skip_loop, 0, 0 }, - { "part loop", prog_name_part_loop, 3, 2 + 3 + 4 }, - { "full loop", prog_name_full_loop, 4, 2 + 3 + 4 + 5 }, + { "skip loop", "skip_loop", 0, 0 }, + { "part loop", "part_loop", 3, 2 + 3 + 4 }, + { "full loop", "full_loop", 4, 2 + 3 + 4 + 5 }, }; int i, err, zero = 0, duration = 0; struct bpf_link *link = NULL; @@ -50,7 +47,7 @@ void test_rdonly_maps(void) if (!test__start_subtest(t->subtest_name)) continue; - prog = bpf_object__find_program_by_title(obj, t->prog_name); + prog = bpf_object__find_program_by_name(obj, t->prog_name); if (CHECK(!prog, "find_prog", "prog '%s' not found\n", t->prog_name)) goto cleanup; From eba9c5f498a11a8558e099d14692c24f2fea27e3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:33 -0800 Subject: [PATCH 041/127] libbpf: Refactor global data map initialization Refactor global data map initialization to use anonymous mmap()-ed memory instead of malloc()-ed one. This allows to do a transparent re-mmap()-ing of already existing memory address to point to BPF map's memory after bpf_object__load() step (done in follow up patch). This choreographed setup allows to have a nice and unsurprising way to pre-initialize read-only (and r/w as well) maps by user and after BPF map creation keep working with mmap()-ed contents of this map. All in a way that doesn't require user code to update any pointers: the illusion of working with memory contents is preserved before and after actual BPF map instantiation. Selftests and runqslower example demonstrate this feature in follow up patches. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-10-andriin@fb.com --- tools/lib/bpf/libbpf.c | 97 +++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 38 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e7a6b57d849c..06dfa36ed0bb 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -221,16 +221,12 @@ struct bpf_map { void *priv; bpf_map_clear_priv_t clear_priv; enum libbpf_map_type libbpf_type; + void *mmaped; char *pin_path; bool pinned; bool reused; }; -struct bpf_secdata { - void *rodata; - void *data; -}; - static LIST_HEAD(bpf_objects_list); struct bpf_object { @@ -243,7 +239,6 @@ struct bpf_object { struct bpf_map *maps; size_t nr_maps; size_t maps_cap; - struct bpf_secdata sections; bool loaded; bool has_pseudo_calls; @@ -828,13 +823,24 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) return &obj->maps[obj->nr_maps++]; } +static size_t bpf_map_mmap_sz(const struct bpf_map *map) +{ + long page_sz = sysconf(_SC_PAGE_SIZE); + size_t map_sz; + + map_sz = roundup(map->def.value_size, 8) * map->def.max_entries; + map_sz = roundup(map_sz, page_sz); + return map_sz; +} + static int bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, - int sec_idx, Elf_Data *data, void **data_buff) + int sec_idx, void *data, size_t data_sz) { char map_name[BPF_OBJ_NAME_LEN]; struct bpf_map_def *def; struct bpf_map *map; + int err; map = bpf_object__add_map(obj); if (IS_ERR(map)) @@ -854,7 +860,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, def = &map->def; def->type = BPF_MAP_TYPE_ARRAY; def->key_size = sizeof(int); - def->value_size = data->d_size; + def->value_size = data_sz; def->max_entries = 1; def->map_flags = type == LIBBPF_MAP_RODATA ? BPF_F_RDONLY_PROG : 0; def->map_flags |= BPF_F_MMAPABLE; @@ -862,16 +868,20 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", map_name, map->sec_idx, map->sec_offset, def->map_flags); - if (data_buff) { - *data_buff = malloc(data->d_size); - if (!*data_buff) { - zfree(&map->name); - pr_warn("failed to alloc map content buffer\n"); - return -ENOMEM; - } - memcpy(*data_buff, data->d_buf, data->d_size); + map->mmaped = mmap(NULL, bpf_map_mmap_sz(map), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (map->mmaped == MAP_FAILED) { + err = -errno; + map->mmaped = NULL; + pr_warn("failed to alloc map '%s' content buffer: %d\n", + map->name, err); + zfree(&map->name); + return err; } + if (type != LIBBPF_MAP_BSS) + memcpy(map->mmaped, data, data_sz); + pr_debug("map %td is \"%s\"\n", map - obj->maps, map->name); return 0; } @@ -886,23 +896,24 @@ static int bpf_object__init_global_data_maps(struct bpf_object *obj) if (obj->efile.data_shndx >= 0) { err = bpf_object__init_internal_map(obj, LIBBPF_MAP_DATA, obj->efile.data_shndx, - obj->efile.data, - &obj->sections.data); + obj->efile.data->d_buf, + obj->efile.data->d_size); if (err) return err; } if (obj->efile.rodata_shndx >= 0) { err = bpf_object__init_internal_map(obj, LIBBPF_MAP_RODATA, obj->efile.rodata_shndx, - obj->efile.rodata, - &obj->sections.rodata); + obj->efile.rodata->d_buf, + obj->efile.rodata->d_size); if (err) return err; } if (obj->efile.bss_shndx >= 0) { err = bpf_object__init_internal_map(obj, LIBBPF_MAP_BSS, obj->efile.bss_shndx, - obj->efile.bss, NULL); + NULL, + obj->efile.bss->d_size); if (err) return err; } @@ -2292,27 +2303,32 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) { char *cp, errmsg[STRERR_BUFSIZE]; int err, zero = 0; - __u8 *data; /* Nothing to do here since kernel already zero-initializes .bss map. */ if (map->libbpf_type == LIBBPF_MAP_BSS) return 0; - data = map->libbpf_type == LIBBPF_MAP_DATA ? - obj->sections.data : obj->sections.rodata; + err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); + if (err) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error setting initial map(%s) contents: %s\n", + map->name, cp); + return err; + } - err = bpf_map_update_elem(map->fd, &zero, data, 0); /* Freeze .rodata map as read-only from syscall side. */ - if (!err && map->libbpf_type == LIBBPF_MAP_RODATA) { + if (map->libbpf_type == LIBBPF_MAP_RODATA) { err = bpf_map_freeze(map->fd); if (err) { - cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); pr_warn("Error freezing map(%s) as read-only: %s\n", map->name, cp); - err = 0; + return err; } } - return err; + return 0; } static int @@ -4683,17 +4699,22 @@ void bpf_object__close(struct bpf_object *obj) btf_ext__free(obj->btf_ext); for (i = 0; i < obj->nr_maps; i++) { - zfree(&obj->maps[i].name); - zfree(&obj->maps[i].pin_path); - if (obj->maps[i].clear_priv) - obj->maps[i].clear_priv(&obj->maps[i], - obj->maps[i].priv); - obj->maps[i].priv = NULL; - obj->maps[i].clear_priv = NULL; + struct bpf_map *map = &obj->maps[i]; + + if (map->clear_priv) + map->clear_priv(map, map->priv); + map->priv = NULL; + map->clear_priv = NULL; + + if (map->mmaped) { + munmap(map->mmaped, bpf_map_mmap_sz(map)); + map->mmaped = NULL; + } + + zfree(&map->name); + zfree(&map->pin_path); } - zfree(&obj->sections.rodata); - zfree(&obj->sections.data); zfree(&obj->maps); obj->nr_maps = 0; From 13acb508ae203075134327551a6705e6e8f23d48 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:34 -0800 Subject: [PATCH 042/127] libbpf: Postpone BTF ID finding for TRACING programs to load phase Move BTF ID determination for BPF_PROG_TYPE_TRACING programs to a load phase. Performing it at open step is inconvenient, because it prevents BPF skeleton generation on older host kernel, which doesn't contain BTF_KIND_FUNCs information in vmlinux BTF. This is a common set up, though, when, e.g., selftests are compiled on older host kernel, but the test program itself is executed in qemu VM with bleeding edge kernel. Having this BTF searching performed at load time allows to successfully use bpf_object__open() for codegen and inspection of BPF object file. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-11-andriin@fb.com --- tools/lib/bpf/libbpf.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 06dfa36ed0bb..3488ac4f7015 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3815,11 +3815,22 @@ out: return ret; } -int -bpf_program__load(struct bpf_program *prog, - char *license, __u32 kern_version) +static int libbpf_find_attach_btf_id(const char *name, + enum bpf_attach_type attach_type, + __u32 attach_prog_fd); + +int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) { - int err = 0, fd, i; + int err = 0, fd, i, btf_id; + + if (prog->type == BPF_PROG_TYPE_TRACING) { + btf_id = libbpf_find_attach_btf_id(prog->section_name, + prog->expected_attach_type, + prog->attach_prog_fd); + if (btf_id <= 0) + return btf_id; + prog->attach_btf_id = btf_id; + } if (prog->instances.nr < 0 || !prog->instances.fds) { if (prog->preprocessor) { @@ -3843,7 +3854,7 @@ bpf_program__load(struct bpf_program *prog, prog->section_name, prog->instances.nr); } err = load_program(prog, prog->insns, prog->insns_cnt, - license, kern_version, &fd); + license, kern_ver, &fd); if (!err) prog->instances.fds[0] = fd; goto out; @@ -3872,9 +3883,7 @@ bpf_program__load(struct bpf_program *prog, } err = load_program(prog, result.new_insn_ptr, - result.new_insn_cnt, - license, kern_version, &fd); - + result.new_insn_cnt, license, kern_ver, &fd); if (err) { pr_warn("Loading the %dth instance of program '%s' failed\n", i, prog->section_name); @@ -3918,9 +3927,6 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) return 0; } -static int libbpf_find_attach_btf_id(const char *name, - enum bpf_attach_type attach_type, - __u32 attach_prog_fd); static struct bpf_object * __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) @@ -3984,15 +3990,8 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, bpf_program__set_type(prog, prog_type); bpf_program__set_expected_attach_type(prog, attach_type); - if (prog_type == BPF_PROG_TYPE_TRACING) { - err = libbpf_find_attach_btf_id(prog->section_name, - attach_type, - attach_prog_fd); - if (err <= 0) - goto out; - prog->attach_btf_id = err; + if (prog_type == BPF_PROG_TYPE_TRACING) prog->attach_prog_fd = attach_prog_fd; - } } return obj; From 3f51935314b8c0e0f45f28bed2e7a023b2c7627c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:35 -0800 Subject: [PATCH 043/127] libbpf: Reduce log level of supported section names dump It's quite spammy. And now that bpf_object__open() is trying to determine program type from its section name, we are getting these verbose messages all the time. Reduce their log level to DEBUG. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-12-andriin@fb.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3488ac4f7015..b6bd6c47c919 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5196,7 +5196,7 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, pr_warn("failed to guess program type from ELF section '%s'\n", name); type_names = libbpf_get_type_names(false); if (type_names != NULL) { - pr_info("supported section(type) names are:%s\n", type_names); + pr_debug("supported section(type) names are:%s\n", type_names); free(type_names); } From d66562fba1ce66975bd61b0786fb8b1810f33caa Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:36 -0800 Subject: [PATCH 044/127] libbpf: Add BPF object skeleton support Add new set of APIs, allowing to open/load/attach BPF object through BPF object skeleton, generated by bpftool for a specific BPF object file. All the xxx_skeleton() APIs wrap up corresponding bpf_object_xxx() APIs, but additionally also automate map/program lookups by name, global data initialization and mmap()-ing, etc. All this greatly improves and simplifies userspace usability of working with BPF programs. See follow up patches for examples. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-13-andriin@fb.com --- tools/lib/bpf/libbpf.c | 162 +++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 38 +++++++++ tools/lib/bpf/libbpf.map | 5 ++ 3 files changed, 205 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b6bd6c47c919..a1a902fa6e0c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6793,3 +6793,165 @@ int libbpf_num_possible_cpus(void) WRITE_ONCE(cpus, tmp_cpus); return tmp_cpus; } + +int bpf_object__open_skeleton(struct bpf_object_skeleton *s, + const struct bpf_object_open_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, skel_opts, + .object_name = s->name, + ); + struct bpf_object *obj; + int i; + + /* Attempt to preserve opts->object_name, unless overriden by user + * explicitly. Overwriting object name for skeletons is discouraged, + * as it breaks global data maps, because they contain object name + * prefix as their own map name prefix. When skeleton is generated, + * bpftool is making an assumption that this name will stay the same. + */ + if (opts) { + memcpy(&skel_opts, opts, sizeof(*opts)); + if (!opts->object_name) + skel_opts.object_name = s->name; + } + + obj = bpf_object__open_mem(s->data, s->data_sz, &skel_opts); + if (IS_ERR(obj)) { + pr_warn("failed to initialize skeleton BPF object '%s': %ld\n", + s->name, PTR_ERR(obj)); + return PTR_ERR(obj); + } + + *s->obj = obj; + + for (i = 0; i < s->map_cnt; i++) { + struct bpf_map **map = s->maps[i].map; + const char *name = s->maps[i].name; + void **mmaped = s->maps[i].mmaped; + + *map = bpf_object__find_map_by_name(obj, name); + if (!*map) { + pr_warn("failed to find skeleton map '%s'\n", name); + return -ESRCH; + } + + if (mmaped) + *mmaped = (*map)->mmaped; + } + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_program **prog = s->progs[i].prog; + const char *name = s->progs[i].name; + + *prog = bpf_object__find_program_by_name(obj, name); + if (!*prog) { + pr_warn("failed to find skeleton program '%s'\n", name); + return -ESRCH; + } + } + + return 0; +} + +int bpf_object__load_skeleton(struct bpf_object_skeleton *s) +{ + int i, err; + + err = bpf_object__load(*s->obj); + if (err) { + pr_warn("failed to load BPF skeleton '%s': %d\n", s->name, err); + return err; + } + + for (i = 0; i < s->map_cnt; i++) { + struct bpf_map *map = *s->maps[i].map; + size_t mmap_sz = bpf_map_mmap_sz(map); + int prot, map_fd = bpf_map__fd(map); + void **mmaped = s->maps[i].mmaped; + void *remapped; + + if (!mmaped) + continue; + + if (!(map->def.map_flags & BPF_F_MMAPABLE)) { + *mmaped = NULL; + continue; + } + + if (map->def.map_flags & BPF_F_RDONLY_PROG) + prot = PROT_READ; + else + prot = PROT_READ | PROT_WRITE; + + /* Remap anonymous mmap()-ed "map initialization image" as + * a BPF map-backed mmap()-ed memory, but preserving the same + * memory address. This will cause kernel to change process' + * page table to point to a different piece of kernel memory, + * but from userspace point of view memory address (and its + * contents, being identical at this point) will stay the + * same. This mapping will be released by bpf_object__close() + * as per normal clean up procedure, so we don't need to worry + * about it from skeleton's clean up perspective. + */ + remapped = mmap(*mmaped, mmap_sz, prot, MAP_SHARED | MAP_FIXED, + map_fd, 0); + if (remapped == MAP_FAILED) { + err = -errno; + *mmaped = NULL; + pr_warn("failed to re-mmap() map '%s': %d\n", + bpf_map__name(map), err); + return err; + } + } + + return 0; +} + +int bpf_object__attach_skeleton(struct bpf_object_skeleton *s) +{ + int i; + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_program *prog = *s->progs[i].prog; + struct bpf_link **link = s->progs[i].link; + const struct bpf_sec_def *sec_def; + const char *sec_name = bpf_program__title(prog, false); + + sec_def = find_sec_def(sec_name); + if (!sec_def || !sec_def->attach_fn) + continue; + + *link = sec_def->attach_fn(sec_def, prog); + if (IS_ERR(*link)) { + pr_warn("failed to auto-attach program '%s': %ld\n", + bpf_program__name(prog), PTR_ERR(*link)); + return PTR_ERR(*link); + } + } + + return 0; +} + +void bpf_object__detach_skeleton(struct bpf_object_skeleton *s) +{ + int i; + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_link **link = s->progs[i].link; + + if (!IS_ERR_OR_NULL(*link)) + bpf_link__destroy(*link); + *link = NULL; + } +} + +void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) +{ + if (s->progs) + bpf_object__detach_skeleton(s); + if (s->obj) + bpf_object__close(*s->obj); + free(s->maps); + free(s->progs); + free(s); +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f37bd4a3e14b..623191e71415 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -631,6 +631,44 @@ BPF_EMBED_OBJ_DECLARE(NAME) #define BPF_EMBED_OBJ(NAME, PATH) __BPF_EMBED_OBJ(NAME, PATH, 16, ".quad") #endif +struct bpf_map_skeleton { + const char *name; + struct bpf_map **map; + void **mmaped; +}; + +struct bpf_prog_skeleton { + const char *name; + struct bpf_program **prog; + struct bpf_link **link; +}; + +struct bpf_object_skeleton { + size_t sz; /* size of this struct, for forward/backward compatibility */ + + const char *name; + void *data; + size_t data_sz; + + struct bpf_object **obj; + + int map_cnt; + int map_skel_sz; /* sizeof(struct bpf_skeleton_map) */ + struct bpf_map_skeleton *maps; + + int prog_cnt; + int prog_skel_sz; /* sizeof(struct bpf_skeleton_prog) */ + struct bpf_prog_skeleton *progs; +}; + +LIBBPF_API int +bpf_object__open_skeleton(struct bpf_object_skeleton *s, + const struct bpf_object_open_opts *opts); +LIBBPF_API int bpf_object__load_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API int bpf_object__attach_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API void bpf_object__detach_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 5a7630748eeb..1376d992a703 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -213,6 +213,11 @@ LIBBPF_0.0.7 { global: btf_dump__emit_type_decl; bpf_object__find_program_by_name; + bpf_object__attach_skeleton; + bpf_object__destroy_skeleton; + bpf_object__detach_skeleton; + bpf_object__load_skeleton; + bpf_object__open_skeleton; bpf_program__attach; bpf_program__name; btf__align_of; From 985ead416df39d6fe8e89580cc1db6aa273e0175 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:37 -0800 Subject: [PATCH 045/127] bpftool: Add skeleton codegen command Add `bpftool gen skeleton` command, which takes in compiled BPF .o object file and dumps a BPF skeleton struct and related code to work with that skeleton. Skeleton itself is tailored to a specific structure of provided BPF object file, containing accessors (just plain struct fields) for every map and program, as well as dedicated space for bpf_links. If BPF program is using global variables, corresponding structure definitions of compatible memory layout are emitted as well, making it possible to initialize and subsequently read/update global variables values using simple and clear C syntax for accessing fields. This skeleton majorly improves usability of opening/loading/attaching of BPF object, as well as interacting with it throughout the lifetime of loaded BPF object. Generated skeleton struct has the following structure: struct { /* used by libbpf's skeleton API */ struct bpf_object_skeleton *skeleton; /* bpf_object for libbpf APIs */ struct bpf_object *obj; struct { /* for every defined map in BPF object: */ struct bpf_map *; } maps; struct { /* for every program in BPF object: */ struct bpf_program *; } progs; struct { /* for every program in BPF object: */ struct bpf_link *; } links; /* for every present global data section: */ struct __ { /* memory layout of corresponding data section, * with every defined variable represented as a struct field * with exactly the same type, but without const/volatile * modifiers, e.g.: */ int *my_var_1; ... } *; }; This provides great usability improvements: - no need to look up maps and programs by name, instead just my_obj->maps.my_map or my_obj->progs.my_prog would give necessary bpf_map/bpf_program pointers, which user can pass to existing libbpf APIs; - pre-defined places for bpf_links, which will be automatically populated for program types that libbpf knows how to attach automatically (currently tracepoints, kprobe/kretprobe, raw tracepoint and tracing programs). On tearing down skeleton, all active bpf_links will be destroyed (meaning BPF programs will be detached, if they are attached). For cases in which libbpf doesn't know how to auto-attach BPF program, user can manually create link after loading skeleton and they will be auto-detached on skeleton destruction: my_obj->links.my_fancy_prog = bpf_program__attach_cgroup_whatever( my_obj->progs.my_fancy_prog, rodata->my_var = 123; my_obj__load(skel); /* 123 will be initialization value for my_var */ After load, if kernel supports mmap() for BPF arrays, user can still read (and write for .bss and .data) variables values, but at that point it will be directly mmap()-ed to BPF array, backing global variables. This allows to seamlessly exchange data with BPF side. From userspace program's POV, all the pointers and memory contents stay the same, but mapped kernel memory changes to point to created map. If kernel doesn't yet support mmap() for BPF arrays, it's still possible to use those data section structs to pre-initialize .bss, .data, and .rodata, but after load their pointers will be reset to NULL, allowing user code to gracefully handle this condition, if necessary. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-14-andriin@fb.com --- tools/bpf/bpftool/gen.c | 551 +++++++++++++++++++++++++++++++++++++++ tools/bpf/bpftool/main.c | 3 +- tools/bpf/bpftool/main.h | 1 + 3 files changed, 554 insertions(+), 1 deletion(-) create mode 100644 tools/bpf/bpftool/gen.c diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c new file mode 100644 index 000000000000..7379dae35dca --- /dev/null +++ b/tools/bpf/bpftool/gen.c @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Facebook */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "btf.h" +#include "libbpf_internal.h" +#include "json_writer.h" +#include "main.h" + + +#define MAX_OBJ_NAME_LEN 64 + +static void sanitize_identifier(char *name) +{ + int i; + + for (i = 0; name[i]; i++) + if (!isalnum(name[i]) && name[i] != '_') + name[i] = '_'; +} + +static bool str_has_suffix(const char *str, const char *suffix) +{ + size_t i, n1 = strlen(str), n2 = strlen(suffix); + + if (n1 < n2) + return false; + + for (i = 0; i < n2; i++) { + if (str[n1 - i - 1] != suffix[n2 - i - 1]) + return false; + } + + return true; +} + +static void get_obj_name(char *name, const char *file) +{ + /* Using basename() GNU version which doesn't modify arg. */ + strncpy(name, basename(file), MAX_OBJ_NAME_LEN - 1); + name[MAX_OBJ_NAME_LEN - 1] = '\0'; + if (str_has_suffix(name, ".o")) + name[strlen(name) - 2] = '\0'; + sanitize_identifier(name); +} + +static void get_header_guard(char *guard, const char *obj_name) +{ + int i; + + sprintf(guard, "__%s_SKEL_H__", obj_name); + for (i = 0; guard[i]; i++) + guard[i] = toupper(guard[i]); +} + +static const char *get_map_ident(const struct bpf_map *map) +{ + const char *name = bpf_map__name(map); + + if (!bpf_map__is_internal(map)) + return name; + + if (str_has_suffix(name, ".data")) + return "data"; + else if (str_has_suffix(name, ".rodata")) + return "rodata"; + else if (str_has_suffix(name, ".bss")) + return "bss"; + else + return NULL; +} + +static void codegen_btf_dump_printf(void *ct, const char *fmt, va_list args) +{ + vprintf(fmt, args); +} + +static int codegen_datasec_def(struct bpf_object *obj, + struct btf *btf, + struct btf_dump *d, + const struct btf_type *sec, + const char *obj_name) +{ + const char *sec_name = btf__name_by_offset(btf, sec->name_off); + const struct btf_var_secinfo *sec_var = btf_var_secinfos(sec); + int i, err, off = 0, pad_cnt = 0, vlen = btf_vlen(sec); + const char *sec_ident; + char var_ident[256]; + + if (strcmp(sec_name, ".data") == 0) + sec_ident = "data"; + else if (strcmp(sec_name, ".bss") == 0) + sec_ident = "bss"; + else if (strcmp(sec_name, ".rodata") == 0) + sec_ident = "rodata"; + else + return 0; + + printf(" struct %s__%s {\n", obj_name, sec_ident); + for (i = 0; i < vlen; i++, sec_var++) { + const struct btf_type *var = btf__type_by_id(btf, sec_var->type); + const char *var_name = btf__name_by_offset(btf, var->name_off); + DECLARE_LIBBPF_OPTS(btf_dump_emit_type_decl_opts, opts, + .field_name = var_ident, + .indent_level = 2, + ); + int need_off = sec_var->offset, align_off, align; + __u32 var_type_id = var->type; + const struct btf_type *t; + + t = btf__type_by_id(btf, var_type_id); + while (btf_is_mod(t)) { + var_type_id = t->type; + t = btf__type_by_id(btf, var_type_id); + } + + if (off > need_off) { + p_err("Something is wrong for %s's variable #%d: need offset %d, already at %d.\n", + sec_name, i, need_off, off); + return -EINVAL; + } + + align = btf__align_of(btf, var->type); + if (align <= 0) { + p_err("Failed to determine alignment of variable '%s': %d", + var_name, align); + return -EINVAL; + } + + align_off = (off + align - 1) / align * align; + if (align_off != need_off) { + printf("\t\tchar __pad%d[%d];\n", + pad_cnt, need_off - off); + pad_cnt++; + } + + /* sanitize variable name, e.g., for static vars inside + * a function, it's name is '.', + * which we'll turn into a '_' + */ + var_ident[0] = '\0'; + strncat(var_ident, var_name, sizeof(var_ident) - 1); + sanitize_identifier(var_ident); + + printf("\t\t"); + err = btf_dump__emit_type_decl(d, var_type_id, &opts); + if (err) + return err; + printf(";\n"); + + off = sec_var->offset + sec_var->size; + } + printf(" } *%s;\n", sec_ident); + return 0; +} + +static int codegen_datasecs(struct bpf_object *obj, const char *obj_name) +{ + struct btf *btf = bpf_object__btf(obj); + int n = btf__get_nr_types(btf); + struct btf_dump *d; + int i, err = 0; + + d = btf_dump__new(btf, NULL, NULL, codegen_btf_dump_printf); + if (IS_ERR(d)) + return PTR_ERR(d); + + for (i = 1; i <= n; i++) { + const struct btf_type *t = btf__type_by_id(btf, i); + + if (!btf_is_datasec(t)) + continue; + + err = codegen_datasec_def(obj, btf, d, t, obj_name); + if (err) + goto out; + } +out: + btf_dump__free(d); + return err; +} + +static int codegen(const char *template, ...) +{ + const char *src, *end; + int skip_tabs = 0, n; + char *s, *dst; + va_list args; + char c; + + n = strlen(template); + s = malloc(n + 1); + if (!s) + return -ENOMEM; + src = template; + dst = s; + + /* find out "baseline" indentation to skip */ + while ((c = *src++)) { + if (c == '\t') { + skip_tabs++; + } else if (c == '\n') { + break; + } else { + p_err("unrecognized character at pos %td in template '%s'", + src - template - 1, template); + return -EINVAL; + } + } + + while (*src) { + /* skip baseline indentation tabs */ + for (n = skip_tabs; n > 0; n--, src++) { + if (*src != '\t') { + p_err("not enough tabs at pos %td in template '%s'", + src - template - 1, template); + return -EINVAL; + } + } + /* trim trailing whitespace */ + end = strchrnul(src, '\n'); + for (n = end - src; n > 0 && isspace(src[n - 1]); n--) + ; + memcpy(dst, src, n); + dst += n; + if (*end) + *dst++ = '\n'; + src = *end ? end + 1 : end; + } + *dst++ = '\0'; + + /* print out using adjusted template */ + va_start(args, template); + n = vprintf(s, args); + va_end(args); + + free(s); + return n; +} + +static int do_skeleton(int argc, char **argv) +{ + char header_guard[MAX_OBJ_NAME_LEN + sizeof("__SKEL_H__")]; + size_t i, map_cnt = 0, prog_cnt = 0; + char obj_name[MAX_OBJ_NAME_LEN]; + const char *file, *ident; + struct bpf_program *prog; + struct bpf_object *obj; + struct bpf_map *map; + struct btf *btf; + int err = -1; + + if (!REQ_ARGS(1)) { + usage(); + return -1; + } + file = GET_ARG(); + + if (argc) { + p_err("extra unknown arguments"); + return -1; + } + + obj = bpf_object__open_file(file, NULL); + if (IS_ERR(obj)) { + p_err("failed to open BPF object file: %ld", PTR_ERR(obj)); + return -1; + } + + get_obj_name(obj_name, file); + get_header_guard(header_guard, obj_name); + + bpf_object__for_each_map(map, obj) { + ident = get_map_ident(map); + if (!ident) { + p_err("ignoring unrecognized internal map '%s'...", + bpf_map__name(map)); + continue; + } + map_cnt++; + } + bpf_object__for_each_program(prog, obj) { + prog_cnt++; + } + + codegen("\ + \n\ + /* THIS FILE IS AUTOGENERATED! */ \n\ + #ifndef %2$s \n\ + #define %2$s \n\ + \n\ + #include \n\ + #include \n\ + \n\ + struct %1$s { \n\ + struct bpf_object_skeleton *skeleton; \n\ + struct bpf_object *obj; \n\ + ", + obj_name, header_guard + ); + + if (map_cnt) { + printf("\tstruct {\n"); + bpf_object__for_each_map(map, obj) { + ident = get_map_ident(map); + if (!ident) + continue; + printf("\t\tstruct bpf_map *%s;\n", ident); + } + printf("\t} maps;\n"); + } + + if (prog_cnt) { + printf("\tstruct {\n"); + bpf_object__for_each_program(prog, obj) { + printf("\t\tstruct bpf_program *%s;\n", + bpf_program__name(prog)); + } + printf("\t} progs;\n"); + printf("\tstruct {\n"); + bpf_object__for_each_program(prog, obj) { + printf("\t\tstruct bpf_link *%s;\n", + bpf_program__name(prog)); + } + printf("\t} links;\n"); + } + + btf = bpf_object__btf(obj); + if (btf) { + err = codegen_datasecs(obj, obj_name); + if (err) + goto out; + } + + codegen("\ + \n\ + }; \n\ + \n\ + static inline struct bpf_object_skeleton * \n\ + %1$s__create_skeleton(struct %1$s *obj, struct bpf_embed_data *embed)\n\ + { \n\ + struct bpf_object_skeleton *s; \n\ + \n\ + s = calloc(1, sizeof(*s)); \n\ + if (!s) \n\ + return NULL; \n\ + \n\ + s->sz = sizeof(*s); \n\ + s->name = \"%1$s\"; \n\ + s->data = embed->data; \n\ + s->data_sz = embed->size; \n\ + s->obj = &obj->obj; \n\ + ", + obj_name + ); + if (map_cnt) { + codegen("\ + \n\ + \n\ + /* maps */ \n\ + s->map_cnt = %zu; \n\ + s->map_skel_sz = sizeof(*s->maps); \n\ + s->maps = calloc(s->map_cnt, s->map_skel_sz);\n\ + if (!s->maps) \n\ + goto err; \n\ + ", + map_cnt + ); + i = 0; + bpf_object__for_each_map(map, obj) { + const char *ident = get_map_ident(map); + + if (!ident) + continue; + + codegen("\ + \n\ + \n\ + s->maps[%zu].name = \"%s\"; \n\ + s->maps[%zu].map = &obj->maps.%s; \n\ + ", + i, bpf_map__name(map), i, ident); + /* memory-mapped internal maps */ + if (bpf_map__is_internal(map) && + (bpf_map__def(map)->map_flags & BPF_F_MMAPABLE)) { + printf("\ts->maps[%zu].mmaped = (void **)&obj->%s;\n", + i, ident); + } + i++; + } + } + if (prog_cnt) { + codegen("\ + \n\ + \n\ + /* programs */ \n\ + s->prog_cnt = %zu; \n\ + s->prog_skel_sz = sizeof(*s->progs); \n\ + s->progs = calloc(s->prog_cnt, s->prog_skel_sz);\n\ + if (!s->progs) \n\ + goto err; \n\ + ", + prog_cnt + ); + i = 0; + bpf_object__for_each_program(prog, obj) { + codegen("\ + \n\ + \n\ + s->progs[%1$zu].name = \"%2$s\"; \n\ + s->progs[%1$zu].prog = &obj->progs.%2$s;\n\ + s->progs[%1$zu].link = &obj->links.%2$s;\n\ + ", + i, bpf_program__name(prog)); + i++; + } + } + codegen("\ + \n\ + \n\ + return s; \n\ + err: \n\ + bpf_object__destroy_skeleton(s); \n\ + return NULL; \n\ + } \n\ + \n\ + static void \n\ + %1$s__destroy(struct %1$s *obj) \n\ + { \n\ + if (!obj) \n\ + return; \n\ + if (obj->skeleton) \n\ + bpf_object__destroy_skeleton(obj->skeleton);\n\ + free(obj); \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open_opts(struct bpf_embed_data *embed, const struct bpf_object_open_opts *opts)\n\ + { \n\ + struct %1$s *obj; \n\ + \n\ + obj = calloc(1, sizeof(*obj)); \n\ + if (!obj) \n\ + return NULL; \n\ + \n\ + obj->skeleton = %1$s__create_skeleton(obj, embed); \n\ + if (!obj->skeleton) \n\ + goto err; \n\ + \n\ + if (bpf_object__open_skeleton(obj->skeleton, opts)) \n\ + goto err; \n\ + \n\ + return obj; \n\ + err: \n\ + %1$s__destroy(obj); \n\ + return NULL; \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open(struct bpf_embed_data *embed) \n\ + { \n\ + return %1$s__open_opts(embed, NULL); \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__load(struct %1$s *obj) \n\ + { \n\ + return bpf_object__load_skeleton(obj->skeleton); \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open_and_load(struct bpf_embed_data *embed) \n\ + { \n\ + struct %1$s *obj; \n\ + \n\ + obj = %1$s__open(embed); \n\ + if (!obj) \n\ + return NULL; \n\ + if (%1$s__load(obj)) { \n\ + %1$s__destroy(obj); \n\ + return NULL; \n\ + } \n\ + return obj; \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__attach(struct %1$s *obj) \n\ + { \n\ + return bpf_object__attach_skeleton(obj->skeleton); \n\ + } \n\ + \n\ + static inline void \n\ + %1$s__detach(struct %1$s *obj) \n\ + { \n\ + return bpf_object__detach_skeleton(obj->skeleton); \n\ + } \n\ + \n\ + #endif /* %2$s */ \n\ + ", + obj_name, header_guard + ); + err = 0; +out: + bpf_object__close(obj); + return err; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s gen skeleton FILE\n" + " %1$s gen help\n" + "\n" + " " HELP_SPEC_OPTIONS "\n" + "", + bin_name); + + return 0; +} + +static const struct cmd cmds[] = { + { "skeleton", do_skeleton }, + { "help", do_help }, + { 0 } +}; + +int do_gen(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 4764581ff9ea..1fe91c558508 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -58,7 +58,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup | perf | net | feature | btf }\n" + " OBJECT := { prog | map | cgroup | perf | net | feature | btf | gen }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -227,6 +227,7 @@ static const struct cmd cmds[] = { { "net", do_net }, { "feature", do_feature }, { "btf", do_btf }, + { "gen", do_gen }, { "version", do_version }, { 0 } }; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 81890f4c8cca..4e75b58d3989 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -155,6 +155,7 @@ int do_net(int argc, char **arg); int do_tracelog(int argc, char **arg); int do_feature(int argc, char **argv); int do_btf(int argc, char **argv); +int do_gen(int argc, char **argv); int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); From f3c926a4df2cddf6230c3f56b1f43e439552cdad Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:38 -0800 Subject: [PATCH 046/127] selftests/bpf: Add BPF skeletons selftests and convert attach_probe.c Add BPF skeleton generation to selftest/bpf's Makefile. Convert attach_probe.c to use skeleton. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-15-andriin@fb.com --- tools/testing/selftests/bpf/.gitignore | 2 + tools/testing/selftests/bpf/Makefile | 36 +++-- .../selftests/bpf/prog_tests/attach_probe.c | 135 +++++------------- .../selftests/bpf/progs/test_attach_probe.c | 34 ++--- 4 files changed, 74 insertions(+), 133 deletions(-) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 419652458da4..ce5af95ede42 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -38,5 +38,7 @@ test_hashmap test_btf_dump xdping test_cpp +*.skel.h /no_alu32 /bpf_gcc +/tools diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 90de7d75b5c6..f70c8e735120 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -3,10 +3,12 @@ include ../../../../scripts/Kbuild.include include ../../../scripts/Makefile.arch CURDIR := $(abspath .) -LIBDIR := $(abspath ../../../lib) +TOOLSDIR := $(abspath ../../..) +LIBDIR := $(TOOLSDIR)/lib BPFDIR := $(LIBDIR)/bpf -TOOLSDIR := $(abspath ../../../include) -APIDIR := $(TOOLSDIR)/uapi +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi GENDIR := $(abspath ../../../../include/generated) GENHDR := $(GENDIR)/autoconf.h @@ -19,7 +21,7 @@ LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) CFLAGS += -g -Wall -O2 $(GENFLAGS) -I$(APIDIR) -I$(LIBDIR) -I$(BPFDIR) \ - -I$(GENDIR) -I$(TOOLSDIR) -I$(CURDIR) \ + -I$(GENDIR) -I$(TOOLSINCDIR) -I$(CURDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lrt -lpthread @@ -117,6 +119,12 @@ $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c # force a rebuild of BPFOBJ when its dependencies are updated force: +DEFAULT_BPFTOOL := $(OUTPUT)/tools/usr/local/sbin/bpftool +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +$(DEFAULT_BPFTOOL): force + $(MAKE) -C $(BPFTOOLDIR) DESTDIR=$(OUTPUT)/tools install + $(BPFOBJ): force $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ @@ -180,6 +188,8 @@ define GCC_BPF_BUILD_RULE $(BPF_GCC) $3 $4 -O2 -c $1 -o $2 endef +SKEL_BLACKLIST := btf__% test_pinning_invalid.c + # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. # Parameters: @@ -195,8 +205,11 @@ TRUNNER_EXTRA_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \ $$(filter %.c,$(TRUNNER_EXTRA_SOURCES))) TRUNNER_EXTRA_HDRS := $$(filter %.h,$(TRUNNER_EXTRA_SOURCES)) TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h -TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \ - $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c))) +TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c)) +TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS)) +TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ + $$(filter-out $(SKEL_BLACKLIST), \ + $$(TRUNNER_BPF_SRCS))) # Evaluate rules now with extra TRUNNER_XXX variables above already defined $$(eval $$(call DEFINE_TEST_RUNNER_RULES,$1,$2)) @@ -226,6 +239,11 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS), \ $(TRUNNER_BPF_LDFLAGS)) + +$(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \ + $(TRUNNER_OUTPUT)/%.o \ + | $(BPFTOOL) $(TRUNNER_OUTPUT) + $$(BPFTOOL) gen skeleton $$< > $$@ endif # ensure we set up tests.h header generation rule just once @@ -245,6 +263,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TESTS_DIR)/%.c \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_BPF_OBJS) \ + $(TRUNNER_BPF_SKELS) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) cd $$(@D) && $$(CC) $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) @@ -255,9 +274,9 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ +# only copy extra resources if in flavored build $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) ifneq ($2,) - # only copy extra resources if in flavored build cp -a $$^ $(TRUNNER_OUTPUT)/ endif @@ -323,4 +342,5 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(BPFOBJ) EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ - feature $(OUTPUT)/*.o $(OUTPUT)/no_alu32 $(OUTPUT)/bpf_gcc + feature $(OUTPUT)/*.o $(OUTPUT)/no_alu32 $(OUTPUT)/bpf_gcc \ + tools *.skel.h diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index b2e7c1424b07..60da1d08daa0 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "test_attach_probe.skel.h" ssize_t get_base_addr() { size_t start; @@ -25,26 +26,10 @@ BPF_EMBED_OBJ(probe, "test_attach_probe.o"); void test_attach_probe(void) { - const char *kprobe_name = "kprobe/sys_nanosleep"; - const char *kretprobe_name = "kretprobe/sys_nanosleep"; - const char *uprobe_name = "uprobe/trigger_func"; - const char *uretprobe_name = "uretprobe/trigger_func"; - const int kprobe_idx = 0, kretprobe_idx = 1; - const int uprobe_idx = 2, uretprobe_idx = 3; - const char *obj_name = "attach_probe"; - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts, - .object_name = obj_name, - .relaxed_maps = true, - ); - struct bpf_program *kprobe_prog, *kretprobe_prog; - struct bpf_program *uprobe_prog, *uretprobe_prog; - struct bpf_object *obj; - int err, duration = 0, res; - struct bpf_link *kprobe_link = NULL; - struct bpf_link *kretprobe_link = NULL; - struct bpf_link *uprobe_link = NULL; - struct bpf_link *uretprobe_link = NULL; - int results_map_fd; + int duration = 0; + struct bpf_link *kprobe_link, *kretprobe_link; + struct bpf_link *uprobe_link, *uretprobe_link; + struct test_attach_probe* skel; size_t uprobe_offset; ssize_t base_addr; @@ -54,124 +39,68 @@ void test_attach_probe(void) return; uprobe_offset = (size_t)&get_base_addr - base_addr; - /* open object */ - obj = bpf_object__open_mem(probe_embed.data, probe_embed.size, - &open_opts); - if (CHECK(IS_ERR(obj), "obj_open_mem", "err %ld\n", PTR_ERR(obj))) + skel = test_attach_probe__open_and_load(&probe_embed); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) return; - - if (CHECK(strcmp(bpf_object__name(obj), obj_name), "obj_name", - "wrong obj name '%s', expected '%s'\n", - bpf_object__name(obj), obj_name)) + if (CHECK(!skel->bss, "check_bss", ".bss wasn't mmap()-ed\n")) goto cleanup; - kprobe_prog = bpf_object__find_program_by_title(obj, kprobe_name); - if (CHECK(!kprobe_prog, "find_probe", - "prog '%s' not found\n", kprobe_name)) - goto cleanup; - kretprobe_prog = bpf_object__find_program_by_title(obj, kretprobe_name); - if (CHECK(!kretprobe_prog, "find_probe", - "prog '%s' not found\n", kretprobe_name)) - goto cleanup; - uprobe_prog = bpf_object__find_program_by_title(obj, uprobe_name); - if (CHECK(!uprobe_prog, "find_probe", - "prog '%s' not found\n", uprobe_name)) - goto cleanup; - uretprobe_prog = bpf_object__find_program_by_title(obj, uretprobe_name); - if (CHECK(!uretprobe_prog, "find_probe", - "prog '%s' not found\n", uretprobe_name)) - goto cleanup; - - /* create maps && load programs */ - err = bpf_object__load(obj); - if (CHECK(err, "obj_load", "err %d\n", err)) - goto cleanup; - - /* load maps */ - results_map_fd = bpf_find_map(__func__, obj, "results_map"); - if (CHECK(results_map_fd < 0, "find_results_map", - "err %d\n", results_map_fd)) - goto cleanup; - - kprobe_link = bpf_program__attach_kprobe(kprobe_prog, + kprobe_link = bpf_program__attach_kprobe(skel->progs.handle_kprobe, false /* retprobe */, SYS_NANOSLEEP_KPROBE_NAME); if (CHECK(IS_ERR(kprobe_link), "attach_kprobe", - "err %ld\n", PTR_ERR(kprobe_link))) { - kprobe_link = NULL; + "err %ld\n", PTR_ERR(kprobe_link))) goto cleanup; - } - kretprobe_link = bpf_program__attach_kprobe(kretprobe_prog, + skel->links.handle_kprobe = kprobe_link; + + kretprobe_link = bpf_program__attach_kprobe(skel->progs.handle_kretprobe, true /* retprobe */, SYS_NANOSLEEP_KPROBE_NAME); if (CHECK(IS_ERR(kretprobe_link), "attach_kretprobe", - "err %ld\n", PTR_ERR(kretprobe_link))) { - kretprobe_link = NULL; + "err %ld\n", PTR_ERR(kretprobe_link))) goto cleanup; - } - uprobe_link = bpf_program__attach_uprobe(uprobe_prog, + skel->links.handle_kretprobe = kretprobe_link; + + uprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uprobe, false /* retprobe */, 0 /* self pid */, "/proc/self/exe", uprobe_offset); if (CHECK(IS_ERR(uprobe_link), "attach_uprobe", - "err %ld\n", PTR_ERR(uprobe_link))) { - uprobe_link = NULL; + "err %ld\n", PTR_ERR(uprobe_link))) goto cleanup; - } - uretprobe_link = bpf_program__attach_uprobe(uretprobe_prog, + skel->links.handle_uprobe = uprobe_link; + + uretprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uretprobe, true /* retprobe */, -1 /* any pid */, "/proc/self/exe", uprobe_offset); if (CHECK(IS_ERR(uretprobe_link), "attach_uretprobe", - "err %ld\n", PTR_ERR(uretprobe_link))) { - uretprobe_link = NULL; + "err %ld\n", PTR_ERR(uretprobe_link))) goto cleanup; - } + skel->links.handle_uretprobe = uretprobe_link; /* trigger & validate kprobe && kretprobe */ usleep(1); - err = bpf_map_lookup_elem(results_map_fd, &kprobe_idx, &res); - if (CHECK(err, "get_kprobe_res", - "failed to get kprobe res: %d\n", err)) + if (CHECK(skel->bss->kprobe_res != 1, "check_kprobe_res", + "wrong kprobe res: %d\n", skel->bss->kprobe_res)) goto cleanup; - if (CHECK(res != kprobe_idx + 1, "check_kprobe_res", - "wrong kprobe res: %d\n", res)) - goto cleanup; - - err = bpf_map_lookup_elem(results_map_fd, &kretprobe_idx, &res); - if (CHECK(err, "get_kretprobe_res", - "failed to get kretprobe res: %d\n", err)) - goto cleanup; - if (CHECK(res != kretprobe_idx + 1, "check_kretprobe_res", - "wrong kretprobe res: %d\n", res)) + if (CHECK(skel->bss->kretprobe_res != 2, "check_kretprobe_res", + "wrong kretprobe res: %d\n", skel->bss->kretprobe_res)) goto cleanup; /* trigger & validate uprobe & uretprobe */ get_base_addr(); - err = bpf_map_lookup_elem(results_map_fd, &uprobe_idx, &res); - if (CHECK(err, "get_uprobe_res", - "failed to get uprobe res: %d\n", err)) + if (CHECK(skel->bss->uprobe_res != 3, "check_uprobe_res", + "wrong uprobe res: %d\n", skel->bss->uprobe_res)) goto cleanup; - if (CHECK(res != uprobe_idx + 1, "check_uprobe_res", - "wrong uprobe res: %d\n", res)) - goto cleanup; - - err = bpf_map_lookup_elem(results_map_fd, &uretprobe_idx, &res); - if (CHECK(err, "get_uretprobe_res", - "failed to get uretprobe res: %d\n", err)) - goto cleanup; - if (CHECK(res != uretprobe_idx + 1, "check_uretprobe_res", - "wrong uretprobe res: %d\n", res)) + if (CHECK(skel->bss->uretprobe_res != 4, "check_uretprobe_res", + "wrong uretprobe res: %d\n", skel->bss->uretprobe_res)) goto cleanup; cleanup: - bpf_link__destroy(kprobe_link); - bpf_link__destroy(kretprobe_link); - bpf_link__destroy(uprobe_link); - bpf_link__destroy(uretprobe_link); - bpf_object__close(obj); + test_attach_probe__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index 534621e38906..221b69700625 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -5,46 +5,36 @@ #include #include "bpf_helpers.h" -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 4); - __type(key, int); - __type(value, int); -} results_map SEC(".maps"); +int kprobe_res = 0; +int kretprobe_res = 0; +int uprobe_res = 0; +int uretprobe_res = 0; SEC("kprobe/sys_nanosleep") -int handle_sys_nanosleep_entry(struct pt_regs *ctx) +int handle_kprobe(struct pt_regs *ctx) { - const int key = 0, value = 1; - - bpf_map_update_elem(&results_map, &key, &value, 0); + kprobe_res = 1; return 0; } SEC("kretprobe/sys_nanosleep") -int handle_sys_getpid_return(struct pt_regs *ctx) +int handle_kretprobe(struct pt_regs *ctx) { - const int key = 1, value = 2; - - bpf_map_update_elem(&results_map, &key, &value, 0); + kretprobe_res = 2; return 0; } SEC("uprobe/trigger_func") -int handle_uprobe_entry(struct pt_regs *ctx) +int handle_uprobe(struct pt_regs *ctx) { - const int key = 2, value = 3; - - bpf_map_update_elem(&results_map, &key, &value, 0); + uprobe_res = 3; return 0; } SEC("uretprobe/trigger_func") -int handle_uprobe_return(struct pt_regs *ctx) +int handle_uretprobe(struct pt_regs *ctx) { - const int key = 3, value = 4; - - bpf_map_update_elem(&results_map, &key, &value, 0); + uretprobe_res = 4; return 0; } From dde53c1b763b5038545efa5d812758ce589654e1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:39 -0800 Subject: [PATCH 047/127] selftests/bpf: Convert few more selftest to skeletons Convert few more selftests to use generated BPF skeletons as a demonstration on how to use it. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-16-andriin@fb.com --- .../selftests/bpf/prog_tests/fentry_fexit.c | 105 ++++++------------ .../selftests/bpf/prog_tests/fentry_test.c | 72 +++++------- tools/testing/selftests/bpf/prog_tests/mmap.c | 60 ++++------ .../bpf/prog_tests/stacktrace_build_id.c | 79 +++++-------- .../bpf/prog_tests/stacktrace_build_id_nmi.c | 84 ++++++-------- 5 files changed, 150 insertions(+), 250 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 40bcff2cc274..110fcf053fd0 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -1,90 +1,59 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include +#include "test_pkt_access.skel.h" +#include "fentry_test.skel.h" +#include "fexit_test.skel.h" + +BPF_EMBED_OBJ(pkt_access, "test_pkt_access.o"); +BPF_EMBED_OBJ(fentry, "fentry_test.o"); +BPF_EMBED_OBJ(fexit, "fexit_test.o"); void test_fentry_fexit(void) { - struct bpf_prog_load_attr attr_fentry = { - .file = "./fentry_test.o", - }; - struct bpf_prog_load_attr attr_fexit = { - .file = "./fexit_test.o", - }; + struct test_pkt_access *pkt_skel = NULL; + struct fentry_test *fentry_skel = NULL; + struct fexit_test *fexit_skel = NULL; + __u64 *fentry_res, *fexit_res; + __u32 duration = 0, retval; + int err, pkt_fd, i; - struct bpf_object *obj_fentry = NULL, *obj_fexit = NULL, *pkt_obj; - struct bpf_map *data_map_fentry, *data_map_fexit; - char fentry_name[] = "fentry/bpf_fentry_testX"; - char fexit_name[] = "fexit/bpf_fentry_testX"; - int err, pkt_fd, kfree_skb_fd, i; - struct bpf_link *link[12] = {}; - struct bpf_program *prog[12]; - __u32 duration, retval; - const int zero = 0; - u64 result[12]; - - err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, - &pkt_obj, &pkt_fd); - if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) + pkt_skel = test_pkt_access__open_and_load(&pkt_access_embed); + if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) return; - err = bpf_prog_load_xattr(&attr_fentry, &obj_fentry, &kfree_skb_fd); - if (CHECK(err, "prog_load fail", "err %d errno %d\n", err, errno)) + fentry_skel = fentry_test__open_and_load(&fentry_embed); + if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto close_prog; - err = bpf_prog_load_xattr(&attr_fexit, &obj_fexit, &kfree_skb_fd); - if (CHECK(err, "prog_load fail", "err %d errno %d\n", err, errno)) + fexit_skel = fexit_test__open_and_load(&fexit_embed); + if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) goto close_prog; - for (i = 0; i < 6; i++) { - fentry_name[sizeof(fentry_name) - 2] = '1' + i; - prog[i] = bpf_object__find_program_by_title(obj_fentry, fentry_name); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", fentry_name)) - goto close_prog; - link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) - goto close_prog; - } - data_map_fentry = bpf_object__find_map_by_name(obj_fentry, "fentry_t.bss"); - if (CHECK(!data_map_fentry, "find_data_map", "data map not found\n")) - goto close_prog; - - for (i = 6; i < 12; i++) { - fexit_name[sizeof(fexit_name) - 2] = '1' + i - 6; - prog[i] = bpf_object__find_program_by_title(obj_fexit, fexit_name); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", fexit_name)) - goto close_prog; - link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) - goto close_prog; - } - data_map_fexit = bpf_object__find_map_by_name(obj_fexit, "fexit_te.bss"); - if (CHECK(!data_map_fexit, "find_data_map", "data map not found\n")) + err = fentry_test__attach(fentry_skel); + if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err)) + goto close_prog; + err = fexit_test__attach(fexit_skel); + if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) goto close_prog; + pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), NULL, NULL, &retval, &duration); CHECK(err || retval, "ipv6", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); - err = bpf_map_lookup_elem(bpf_map__fd(data_map_fentry), &zero, &result); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto close_prog; - - err = bpf_map_lookup_elem(bpf_map__fd(data_map_fexit), &zero, result + 6); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto close_prog; - - for (i = 0; i < 12; i++) - if (CHECK(result[i] != 1, "result", "bpf_fentry_test%d failed err %ld\n", - i % 6 + 1, result[i])) - goto close_prog; + fentry_res = (__u64 *)fentry_skel->bss; + fexit_res = (__u64 *)fexit_skel->bss; + printf("%lld\n", fentry_skel->bss->test1_result); + for (i = 0; i < 6; i++) { + CHECK(fentry_res[i] != 1, "result", + "fentry_test%d failed err %lld\n", i + 1, fentry_res[i]); + CHECK(fexit_res[i] != 1, "result", + "fexit_test%d failed err %lld\n", i + 1, fexit_res[i]); + } close_prog: - for (i = 0; i < 12; i++) - if (!IS_ERR_OR_NULL(link[i])) - bpf_link__destroy(link[i]); - bpf_object__close(obj_fentry); - bpf_object__close(obj_fexit); - bpf_object__close(pkt_obj); + test_pkt_access__destroy(pkt_skel); + fentry_test__destroy(fentry_skel); + fexit_test__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index 9fb103193878..46a4afdf507a 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -1,64 +1,46 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include +#include "test_pkt_access.skel.h" +#include "fentry_test.skel.h" + +BPF_EMBED_OBJ_DECLARE(pkt_access); +BPF_EMBED_OBJ_DECLARE(fentry); void test_fentry_test(void) { - struct bpf_prog_load_attr attr = { - .file = "./fentry_test.o", - }; - - char prog_name[] = "fentry/bpf_fentry_testX"; - struct bpf_object *obj = NULL, *pkt_obj; - int err, pkt_fd, kfree_skb_fd, i; - struct bpf_link *link[6] = {}; - struct bpf_program *prog[6]; + struct test_pkt_access *pkt_skel = NULL; + struct fentry_test *fentry_skel = NULL; + int err, pkt_fd, i; __u32 duration, retval; - struct bpf_map *data_map; - const int zero = 0; - u64 result[6]; + __u64 *result; - err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, - &pkt_obj, &pkt_fd); - if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) + pkt_skel = test_pkt_access__open_and_load(&pkt_access_embed); + if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) return; - err = bpf_prog_load_xattr(&attr, &obj, &kfree_skb_fd); - if (CHECK(err, "prog_load fail", "err %d errno %d\n", err, errno)) - goto close_prog; + fentry_skel = fentry_test__open_and_load(&fentry_embed); + if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) + goto cleanup; - for (i = 0; i < 6; i++) { - prog_name[sizeof(prog_name) - 2] = '1' + i; - prog[i] = bpf_object__find_program_by_title(obj, prog_name); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name)) - goto close_prog; - link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) - goto close_prog; - } - data_map = bpf_object__find_map_by_name(obj, "fentry_t.bss"); - if (CHECK(!data_map, "find_data_map", "data map not found\n")) - goto close_prog; + err = fentry_test__attach(fentry_skel); + if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err)) + goto cleanup; + pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), NULL, NULL, &retval, &duration); CHECK(err || retval, "ipv6", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); - err = bpf_map_lookup_elem(bpf_map__fd(data_map), &zero, &result); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto close_prog; + result = (__u64 *)fentry_skel->bss; + for (i = 0; i < 6; i++) { + if (CHECK(result[i] != 1, "result", + "fentry_test%d failed err %lld\n", i + 1, result[i])) + goto cleanup; + } - for (i = 0; i < 6; i++) - if (CHECK(result[i] != 1, "result", "bpf_fentry_test%d failed err %ld\n", - i + 1, result[i])) - goto close_prog; - -close_prog: - for (i = 0; i < 6; i++) - if (!IS_ERR_OR_NULL(link[i])) - bpf_link__destroy(link[i]); - bpf_object__close(obj); - bpf_object__close(pkt_obj); +cleanup: + fentry_test__destroy(fentry_skel); + test_pkt_access__destroy(pkt_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 051a6d48762c..95a44d37ccea 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -1,59 +1,41 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include "test_mmap.skel.h" struct map_data { __u64 val[512 * 4]; }; -struct bss_data { - __u64 in_val; - __u64 out_val; -}; - static size_t roundup_page(size_t sz) { long page_size = sysconf(_SC_PAGE_SIZE); return (sz + page_size - 1) / page_size * page_size; } +BPF_EMBED_OBJ(test_mmap, "test_mmap.o"); + void test_mmap(void) { - const char *file = "test_mmap.o"; - const char *probe_name = "raw_tracepoint/sys_enter"; - const char *tp_name = "sys_enter"; - const size_t bss_sz = roundup_page(sizeof(struct bss_data)); + const size_t bss_sz = roundup_page(sizeof(struct test_mmap__bss)); const size_t map_sz = roundup_page(sizeof(struct map_data)); const int zero = 0, one = 1, two = 2, far = 1500; const long page_size = sysconf(_SC_PAGE_SIZE); int err, duration = 0, i, data_map_fd; - struct bpf_program *prog; - struct bpf_object *obj; - struct bpf_link *link = NULL; struct bpf_map *data_map, *bss_map; void *bss_mmaped = NULL, *map_mmaped = NULL, *tmp1, *tmp2; - volatile struct bss_data *bss_data; - volatile struct map_data *map_data; + struct test_mmap__bss *bss_data; + struct map_data *map_data; + struct test_mmap *skel; __u64 val = 0; - obj = bpf_object__open_file("test_mmap.o", NULL); - if (CHECK(IS_ERR(obj), "obj_open", "failed to open '%s': %ld\n", - file, PTR_ERR(obj))) - return; - prog = bpf_object__find_program_by_title(obj, probe_name); - if (CHECK(!prog, "find_probe", "prog '%s' not found\n", probe_name)) - goto cleanup; - err = bpf_object__load(obj); - if (CHECK(err, "obj_load", "failed to load prog '%s': %d\n", - probe_name, err)) - goto cleanup; - bss_map = bpf_object__find_map_by_name(obj, "test_mma.bss"); - if (CHECK(!bss_map, "find_bss_map", ".bss map not found\n")) - goto cleanup; - data_map = bpf_object__find_map_by_name(obj, "data_map"); - if (CHECK(!data_map, "find_data_map", "data_map map not found\n")) - goto cleanup; + skel = test_mmap__open_and_load(&test_mmap_embed); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + return; + + bss_map = skel->maps.bss; + data_map = skel->maps.data_map; data_map_fd = bpf_map__fd(data_map); bss_mmaped = mmap(NULL, bss_sz, PROT_READ | PROT_WRITE, MAP_SHARED, @@ -77,13 +59,15 @@ void test_mmap(void) CHECK_FAIL(bss_data->in_val); CHECK_FAIL(bss_data->out_val); + CHECK_FAIL(skel->bss->in_val); + CHECK_FAIL(skel->bss->out_val); CHECK_FAIL(map_data->val[0]); CHECK_FAIL(map_data->val[1]); CHECK_FAIL(map_data->val[2]); CHECK_FAIL(map_data->val[far]); - link = bpf_program__attach_raw_tracepoint(prog, tp_name); - if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link))) + err = test_mmap__attach(skel); + if (CHECK(err, "attach_raw_tp", "err %d\n", err)) goto cleanup; bss_data->in_val = 123; @@ -94,6 +78,8 @@ void test_mmap(void) CHECK_FAIL(bss_data->in_val != 123); CHECK_FAIL(bss_data->out_val != 123); + CHECK_FAIL(skel->bss->in_val != 123); + CHECK_FAIL(skel->bss->out_val != 123); CHECK_FAIL(map_data->val[0] != 111); CHECK_FAIL(map_data->val[1] != 222); CHECK_FAIL(map_data->val[2] != 123); @@ -160,6 +146,8 @@ void test_mmap(void) usleep(1); CHECK_FAIL(bss_data->in_val != 321); CHECK_FAIL(bss_data->out_val != 321); + CHECK_FAIL(skel->bss->in_val != 321); + CHECK_FAIL(skel->bss->out_val != 321); CHECK_FAIL(map_data->val[0] != 111); CHECK_FAIL(map_data->val[1] != 222); CHECK_FAIL(map_data->val[2] != 321); @@ -203,6 +191,8 @@ void test_mmap(void) map_data = tmp2; CHECK_FAIL(bss_data->in_val != 321); CHECK_FAIL(bss_data->out_val != 321); + CHECK_FAIL(skel->bss->in_val != 321); + CHECK_FAIL(skel->bss->out_val != 321); CHECK_FAIL(map_data->val[0] != 111); CHECK_FAIL(map_data->val[1] != 222); CHECK_FAIL(map_data->val[2] != 321); @@ -214,7 +204,5 @@ cleanup: CHECK_FAIL(munmap(bss_mmaped, bss_sz)); if (map_mmaped) CHECK_FAIL(munmap(map_mmaped, map_sz)); - if (!IS_ERR_OR_NULL(link)) - bpf_link__destroy(link); - bpf_object__close(obj); + test_mmap__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c index d841dced971f..4af8b8253f25 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c @@ -1,16 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "test_stacktrace_build_id.skel.h" + +BPF_EMBED_OBJ(stacktrace_build_id, "test_stacktrace_build_id.o"); void test_stacktrace_build_id(void) { + int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; - const char *prog_name = "tracepoint/random/urandom_read"; - const char *file = "./test_stacktrace_build_id.o"; - int err, prog_fd, stack_trace_len; + struct test_stacktrace_build_id *skel; + int err, stack_trace_len; __u32 key, previous_key, val, duration = 0; - struct bpf_program *prog; - struct bpf_object *obj; - struct bpf_link *link = NULL; char buf[256]; int i, j; struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH]; @@ -18,43 +18,24 @@ void test_stacktrace_build_id(void) int retry = 1; retry: - err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); - if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) + skel = test_stacktrace_build_id__open_and_load(&stacktrace_build_id_embed); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) return; - prog = bpf_object__find_program_by_title(obj, prog_name); - if (CHECK(!prog, "find_prog", "prog '%s' not found\n", prog_name)) - goto close_prog; - - link = bpf_program__attach_tracepoint(prog, "random", "urandom_read"); - if (CHECK(IS_ERR(link), "attach_tp", "err %ld\n", PTR_ERR(link))) - goto close_prog; + err = test_stacktrace_build_id__attach(skel); + if (CHECK(err, "attach_tp", "err %d\n", err)) + goto cleanup; /* find map fds */ - control_map_fd = bpf_find_map(__func__, obj, "control_map"); - if (CHECK(control_map_fd < 0, "bpf_find_map control_map", - "err %d errno %d\n", err, errno)) - goto disable_pmu; - - stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); - if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap", - "err %d errno %d\n", err, errno)) - goto disable_pmu; - - stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); - if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n", - err, errno)) - goto disable_pmu; - - stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap"); - if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap", - "err %d errno %d\n", err, errno)) - goto disable_pmu; + control_map_fd = bpf_map__fd(skel->maps.control_map); + stackid_hmap_fd = bpf_map__fd(skel->maps.stackid_hmap); + stackmap_fd = bpf_map__fd(skel->maps.stackmap); + stack_amap_fd = bpf_map__fd(skel->maps.stack_amap); if (CHECK_FAIL(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null"))) - goto disable_pmu; + goto cleanup; if (CHECK_FAIL(system("./urandom_read"))) - goto disable_pmu; + goto cleanup; /* disable stack trace collection */ key = 0; val = 1; @@ -66,23 +47,23 @@ retry: err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = compare_map_keys(stackmap_fd, stackid_hmap_fd); if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = extract_build_id(buf, 256); if (CHECK(err, "get build_id with readelf", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = bpf_map_get_next_key(stackmap_fd, NULL, &key); if (CHECK(err, "get_next_key from stackmap", "err %d, errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; do { char build_id[64]; @@ -90,7 +71,7 @@ retry: err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs); if (CHECK(err, "lookup_elem from stackmap", "err %d, errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i) if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID && id_offs[i].offset != 0) { @@ -108,8 +89,7 @@ retry: * try it one more time. */ if (build_id_matches < 1 && retry--) { - bpf_link__destroy(link); - bpf_object__close(obj); + test_stacktrace_build_id__destroy(skel); printf("%s:WARN:Didn't find expected build ID from the map, retrying\n", __func__); goto retry; @@ -117,17 +97,14 @@ retry: if (CHECK(build_id_matches < 1, "build id match", "Didn't find expected build ID from the map\n")) - goto disable_pmu; + goto cleanup; - stack_trace_len = PERF_MAX_STACK_DEPTH - * sizeof(struct bpf_stack_build_id); + stack_trace_len = PERF_MAX_STACK_DEPTH * + sizeof(struct bpf_stack_build_id); err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len); CHECK(err, "compare_stack_ips stackmap vs. stack_amap", "err %d errno %d\n", err, errno); -disable_pmu: - bpf_link__destroy(link); - -close_prog: - bpf_object__close(obj); +cleanup: + test_stacktrace_build_id__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index f62aa0eb959b..32fb03881a7b 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "test_stacktrace_build_id.skel.h" static __u64 read_perf_max_sample_freq(void) { @@ -14,21 +15,19 @@ static __u64 read_perf_max_sample_freq(void) return sample_freq; } +BPF_EMBED_OBJ_DECLARE(stacktrace_build_id); + void test_stacktrace_build_id_nmi(void) { - int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; - const char *prog_name = "tracepoint/random/urandom_read"; - const char *file = "./test_stacktrace_build_id.o"; - int err, pmu_fd, prog_fd; + int control_map_fd, stackid_hmap_fd, stackmap_fd; + struct test_stacktrace_build_id *skel; + int err, pmu_fd; struct perf_event_attr attr = { .freq = 1, .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, }; __u32 key, previous_key, val, duration = 0; - struct bpf_program *prog; - struct bpf_object *obj; - struct bpf_link *link; char buf[256]; int i, j; struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH]; @@ -38,13 +37,16 @@ void test_stacktrace_build_id_nmi(void) attr.sample_freq = read_perf_max_sample_freq(); retry: - err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd); - if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) + skel = test_stacktrace_build_id__open(&stacktrace_build_id_embed); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; - prog = bpf_object__find_program_by_title(obj, prog_name); - if (CHECK(!prog, "find_prog", "prog '%s' not found\n", prog_name)) - goto close_prog; + /* override program type */ + bpf_program__set_perf_event(skel->progs.oncpu); + + err = test_stacktrace_build_id__load(skel); + if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err)) + goto cleanup; pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu 0 */, -1 /* group id */, @@ -52,40 +54,25 @@ retry: if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n", pmu_fd, errno)) - goto close_prog; + goto cleanup; - link = bpf_program__attach_perf_event(prog, pmu_fd); - if (CHECK(IS_ERR(link), "attach_perf_event", - "err %ld\n", PTR_ERR(link))) { + skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, + pmu_fd); + if (CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event", + "err %ld\n", PTR_ERR(skel->links.oncpu))) { close(pmu_fd); - goto close_prog; + goto cleanup; } /* find map fds */ - control_map_fd = bpf_find_map(__func__, obj, "control_map"); - if (CHECK(control_map_fd < 0, "bpf_find_map control_map", - "err %d errno %d\n", err, errno)) - goto disable_pmu; - - stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); - if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap", - "err %d errno %d\n", err, errno)) - goto disable_pmu; - - stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); - if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n", - err, errno)) - goto disable_pmu; - - stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap"); - if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap", - "err %d errno %d\n", err, errno)) - goto disable_pmu; + control_map_fd = bpf_map__fd(skel->maps.control_map); + stackid_hmap_fd = bpf_map__fd(skel->maps.stackid_hmap); + stackmap_fd = bpf_map__fd(skel->maps.stackmap); if (CHECK_FAIL(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null"))) - goto disable_pmu; + goto cleanup; if (CHECK_FAIL(system("taskset 0x1 ./urandom_read 100000"))) - goto disable_pmu; + goto cleanup; /* disable stack trace collection */ key = 0; val = 1; @@ -97,23 +84,23 @@ retry: err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = compare_map_keys(stackmap_fd, stackid_hmap_fd); if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = extract_build_id(buf, 256); if (CHECK(err, "get build_id with readelf", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = bpf_map_get_next_key(stackmap_fd, NULL, &key); if (CHECK(err, "get_next_key from stackmap", "err %d, errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; do { char build_id[64]; @@ -121,7 +108,7 @@ retry: err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs); if (CHECK(err, "lookup_elem from stackmap", "err %d, errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i) if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID && id_offs[i].offset != 0) { @@ -139,8 +126,7 @@ retry: * try it one more time. */ if (build_id_matches < 1 && retry--) { - bpf_link__destroy(link); - bpf_object__close(obj); + test_stacktrace_build_id__destroy(skel); printf("%s:WARN:Didn't find expected build ID from the map, retrying\n", __func__); goto retry; @@ -148,7 +134,7 @@ retry: if (CHECK(build_id_matches < 1, "build id match", "Didn't find expected build ID from the map\n")) - goto disable_pmu; + goto cleanup; /* * We intentionally skip compare_stack_ips(). This is because we @@ -157,8 +143,6 @@ retry: * BPF_STACK_BUILD_ID_IP; */ -disable_pmu: - bpf_link__destroy(link); -close_prog: - bpf_object__close(obj); +cleanup: + test_stacktrace_build_id__destroy(skel); } From 197448eaac1ab330fb485769bccb62346ba1f458 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:40 -0800 Subject: [PATCH 048/127] selftests/bpf: Add test validating data section to struct convertion layout Add a simple selftests validating datasection-to-struct layour dumping. Global variables are constructed in such a way as to cause both natural and artificial padding (through custom alignment requirement). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191214014341.3442258-17-andriin@fb.com --- .../selftests/bpf/prog_tests/skeleton.c | 51 +++++++++++++++++++ .../selftests/bpf/progs/test_skeleton.c | 37 ++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/skeleton.c create mode 100644 tools/testing/selftests/bpf/progs/test_skeleton.c diff --git a/tools/testing/selftests/bpf/prog_tests/skeleton.c b/tools/testing/selftests/bpf/prog_tests/skeleton.c new file mode 100644 index 000000000000..79f8d13e6740 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/skeleton.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include + +struct s { + int a; + long long b; +} __attribute__((packed)); + +#include "test_skeleton.skel.h" + +BPF_EMBED_OBJ(skeleton, "test_skeleton.o"); + +void test_skeleton(void) +{ + int duration = 0, err; + struct test_skeleton* skel; + struct test_skeleton__bss *bss; + + skel = test_skeleton__open_and_load(&skeleton_embed); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + + bss = skel->bss; + bss->in1 = 1; + bss->in2 = 2; + bss->in3 = 3; + bss->in4 = 4; + bss->in5.a = 5; + bss->in5.b = 6; + + err = test_skeleton__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + CHECK(bss->out1 != 1, "res1", "got %d != exp %d\n", bss->out1, 1); + CHECK(bss->out2 != 2, "res2", "got %lld != exp %d\n", bss->out2, 2); + CHECK(bss->out3 != 3, "res3", "got %d != exp %d\n", (int)bss->out3, 3); + CHECK(bss->out4 != 4, "res4", "got %lld != exp %d\n", bss->out4, 4); + CHECK(bss->handler_out5.a != 5, "res5", "got %d != exp %d\n", + bss->handler_out5.a, 5); + CHECK(bss->handler_out5.b != 6, "res6", "got %lld != exp %d\n", + bss->handler_out5.b, 6); + +cleanup: + test_skeleton__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_skeleton.c b/tools/testing/selftests/bpf/progs/test_skeleton.c new file mode 100644 index 000000000000..c0013d2e16f2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_skeleton.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include +#include "bpf_helpers.h" + +struct s { + int a; + long long b; +} __attribute__((packed)); + +int in1 = 0; +long long in2 = 0; +char in3 = '\0'; +long long in4 __attribute__((aligned(64))) = 0; +struct s in5 = {}; + +long long out2 = 0; +char out3 = 0; +long long out4 = 0; +int out1 = 0; + + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + static volatile struct s out5; + + out1 = in1; + out2 = in2; + out3 = in3; + out4 = in4; + out5 = in5; + return 0; +} + +char _license[] SEC("license") = "GPL"; From d9c00c3b1639a3c8f46663cc042a3768d222021f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:43:41 -0800 Subject: [PATCH 049/127] bpftool: Add `gen skeleton` BASH completions Add BASH completions for gen sub-command. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Cc: Quentin Monnet Link: https://lore.kernel.org/bpf/20191214014341.3442258-18-andriin@fb.com --- tools/bpf/bpftool/bash-completion/bpftool | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 21c676a1eeb1..754d8395e451 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -839,6 +839,17 @@ _bpftool() ;; esac ;; + gen) + case $command in + skeleton) + _filedir + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'skeleton help' -- "$cur" ) ) + ;; + esac + ;; cgroup) case $command in show|list|tree) From ac9d1389631a4bee0df47e50d6bee8b94230759d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:47:07 -0800 Subject: [PATCH 050/127] libbpf: Extract internal map names into constants Instead of duplicating string literals, keep them in one place and consistent. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191214014710.3449601-2-andriin@fb.com --- tools/lib/bpf/libbpf.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a1a902fa6e0c..f6c135869701 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -195,6 +195,10 @@ struct bpf_program { __u32 prog_flags; }; +#define DATA_SEC ".data" +#define BSS_SEC ".bss" +#define RODATA_SEC ".rodata" + enum libbpf_map_type { LIBBPF_MAP_UNSPEC, LIBBPF_MAP_DATA, @@ -203,9 +207,9 @@ enum libbpf_map_type { }; static const char * const libbpf_type_to_btf_name[] = { - [LIBBPF_MAP_DATA] = ".data", - [LIBBPF_MAP_BSS] = ".bss", - [LIBBPF_MAP_RODATA] = ".rodata", + [LIBBPF_MAP_DATA] = DATA_SEC, + [LIBBPF_MAP_BSS] = BSS_SEC, + [LIBBPF_MAP_RODATA] = RODATA_SEC, }; struct bpf_map { @@ -736,13 +740,13 @@ int bpf_object__section_size(const struct bpf_object *obj, const char *name, *size = 0; if (!name) { return -EINVAL; - } else if (!strcmp(name, ".data")) { + } else if (!strcmp(name, DATA_SEC)) { if (obj->efile.data) *size = obj->efile.data->d_size; - } else if (!strcmp(name, ".bss")) { + } else if (!strcmp(name, BSS_SEC)) { if (obj->efile.bss) *size = obj->efile.bss->d_size; - } else if (!strcmp(name, ".rodata")) { + } else if (!strcmp(name, RODATA_SEC)) { if (obj->efile.rodata) *size = obj->efile.rodata->d_size; } else { @@ -1684,10 +1688,10 @@ static int bpf_object__elf_collect(struct bpf_object *obj) name, obj->path, cp); return err; } - } else if (strcmp(name, ".data") == 0) { + } else if (strcmp(name, DATA_SEC) == 0) { obj->efile.data = data; obj->efile.data_shndx = idx; - } else if (strcmp(name, ".rodata") == 0) { + } else if (strcmp(name, RODATA_SEC) == 0) { obj->efile.rodata = data; obj->efile.rodata_shndx = idx; } else { @@ -1717,7 +1721,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) obj->efile.reloc_sects[nr_sects].shdr = sh; obj->efile.reloc_sects[nr_sects].data = data; - } else if (sh.sh_type == SHT_NOBITS && strcmp(name, ".bss") == 0) { + } else if (sh.sh_type == SHT_NOBITS && + strcmp(name, BSS_SEC) == 0) { obj->efile.bss = data; obj->efile.bss_shndx = idx; } else { From 166750bc1dd256b2184b22588fb9fe6d3fbb93ae Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:47:08 -0800 Subject: [PATCH 051/127] libbpf: Support libbpf-provided extern variables Add support for extern variables, provided to BPF program by libbpf. Currently the following extern variables are supported: - LINUX_KERNEL_VERSION; version of a kernel in which BPF program is executing, follows KERNEL_VERSION() macro convention, can be 4- and 8-byte long; - CONFIG_xxx values; a set of values of actual kernel config. Tristate, boolean, strings, and integer values are supported. Set of possible values is determined by declared type of extern variable. Supported types of variables are: - Tristate values. Are represented as `enum libbpf_tristate`. Accepted values are **strictly** 'y', 'n', or 'm', which are represented as TRI_YES, TRI_NO, or TRI_MODULE, respectively. - Boolean values. Are represented as bool (_Bool) types. Accepted values are 'y' and 'n' only, turning into true/false values, respectively. - Single-character values. Can be used both as a substritute for bool/tristate, or as a small-range integer: - 'y'/'n'/'m' are represented as is, as characters 'y', 'n', or 'm'; - integers in a range [-128, 127] or [0, 255] (depending on signedness of char in target architecture) are recognized and represented with respective values of char type. - Strings. String values are declared as fixed-length char arrays. String of up to that length will be accepted and put in first N bytes of char array, with the rest of bytes zeroed out. If config string value is longer than space alloted, it will be truncated and warning message emitted. Char array is always zero terminated. String literals in config have to be enclosed in double quotes, just like C-style string literals. - Integers. 8-, 16-, 32-, and 64-bit integers are supported, both signed and unsigned variants. Libbpf enforces parsed config value to be in the supported range of corresponding integer type. Integers values in config can be: - decimal integers, with optional + and - signs; - hexadecimal integers, prefixed with 0x or 0X; - octal integers, starting with 0. Config file itself is searched in /boot/config-$(uname -r) location with fallback to /proc/config.gz, unless config path is specified explicitly through bpf_object_open_opts' kernel_config_path option. Both gzipped and plain text formats are supported. Libbpf adds explicit dependency on zlib because of this, but this shouldn't be a problem, given libelf already depends on zlib. All detected extern variables, are put into a separate .extern internal map. It, similarly to .rodata map, is marked as read-only from BPF program side, as well as is frozen on load. This allows BPF verifier to track extern values as constants and perform enhanced branch prediction and dead code elimination. This can be relied upon for doing kernel version/feature detection and using potentially unsupported field relocations or BPF helpers in a CO-RE-based BPF program, while still having a single version of BPF program running on old and new kernels. Selftests are validating this explicitly for unexisting BPF helper. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191214014710.3449601-3-andriin@fb.com --- include/uapi/linux/btf.h | 3 +- tools/include/uapi/linux/btf.h | 7 +- tools/lib/bpf/Makefile | 15 +- tools/lib/bpf/bpf_helpers.h | 9 + tools/lib/bpf/btf.c | 9 +- tools/lib/bpf/libbpf.c | 738 +++++++++++++++++++++++++-- tools/lib/bpf/libbpf.h | 12 +- tools/testing/selftests/bpf/Makefile | 2 +- 8 files changed, 729 insertions(+), 66 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index c02dec97e1ce..1a2898c482ee 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -142,7 +142,8 @@ struct btf_param { enum { BTF_VAR_STATIC = 0, - BTF_VAR_GLOBAL_ALLOCATED, + BTF_VAR_GLOBAL_ALLOCATED = 1, + BTF_VAR_GLOBAL_EXTERN = 2, }; /* BTF_KIND_VAR is followed by a single "struct btf_var" to describe diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 63ae4a39e58b..1a2898c482ee 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -22,9 +22,9 @@ struct btf_header { }; /* Max # of type identifier */ -#define BTF_MAX_TYPE 0x0000ffff +#define BTF_MAX_TYPE 0x000fffff /* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x0000ffff +#define BTF_MAX_NAME_OFFSET 0x00ffffff /* Max # of struct/union/enum members or func args */ #define BTF_MAX_VLEN 0xffff @@ -142,7 +142,8 @@ struct btf_param { enum { BTF_VAR_STATIC = 0, - BTF_VAR_GLOBAL_ALLOCATED, + BTF_VAR_GLOBAL_ALLOCATED = 1, + BTF_VAR_GLOBAL_EXTERN = 2, }; /* BTF_KIND_VAR is followed by a single "struct btf_var" to describe diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 23ae06c43d08..a3718cb275f2 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -56,8 +56,8 @@ ifndef VERBOSE endif FEATURE_USER = .libbpf -FEATURE_TESTS = libelf libelf-mmap bpf reallocarray -FEATURE_DISPLAY = libelf bpf +FEATURE_TESTS = libelf libelf-mmap zlib bpf reallocarray +FEATURE_DISPLAY = libelf zlib bpf INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi FEATURE_CHECK_CFLAGS-bpf = $(INCLUDES) @@ -160,7 +160,7 @@ all: fixdep all_cmd: $(CMD_TARGETS) check -$(BPF_IN_SHARED): force elfdep bpfdep bpf_helper_defs.h +$(BPF_IN_SHARED): force elfdep zdep bpfdep bpf_helper_defs.h @(test -f ../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \ (diff -B ../../include/uapi/linux/bpf.h ../../../include/uapi/linux/bpf.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h'" >&2 )) || true @@ -178,7 +178,7 @@ $(BPF_IN_SHARED): force elfdep bpfdep bpf_helper_defs.h echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(SHARED_OBJDIR) CFLAGS="$(CFLAGS) $(SHLIB_FLAGS)" -$(BPF_IN_STATIC): force elfdep bpfdep bpf_helper_defs.h +$(BPF_IN_STATIC): force elfdep zdep bpfdep bpf_helper_defs.h $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) bpf_helper_defs.h: $(srctree)/tools/include/uapi/linux/bpf.h @@ -190,7 +190,7 @@ $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) $(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN_SHARED) $(QUIET_LINK)$(CC) $(LDFLAGS) \ --shared -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \ - -Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -o $@ + -Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -lz -o $@ @ln -sf $(@F) $(OUTPUT)libbpf.so @ln -sf $(@F) $(OUTPUT)libbpf.so.$(LIBBPF_MAJOR_VERSION) @@ -279,12 +279,15 @@ clean: -PHONY += force elfdep bpfdep cscope tags +PHONY += force elfdep zdep bpfdep cscope tags force: elfdep: @if [ "$(feature-libelf)" != "1" ]; then echo "No libelf found"; exit 1 ; fi +zdep: + @if [ "$(feature-zlib)" != "1" ]; then echo "No zlib found"; exit 1 ; fi + bpfdep: @if [ "$(feature-bpf)" != "1" ]; then echo "BPF API too old"; exit 1 ; fi diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 0c7d28292898..aa46700075e1 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -25,6 +25,9 @@ #ifndef __always_inline #define __always_inline __attribute__((always_inline)) #endif +#ifndef __weak +#define __weak __attribute__((weak)) +#endif /* * Helper structure used by eBPF C program @@ -44,4 +47,10 @@ enum libbpf_pin_type { LIBBPF_PIN_BY_NAME, }; +enum libbpf_tristate { + TRI_NO = 0, + TRI_YES = 1, + TRI_MODULE = 2, +}; + #endif diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 84fe82f27bef..520021939d81 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -578,6 +578,12 @@ static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf, return -ENOENT; } + /* .extern datasec size and var offsets were set correctly during + * extern collection step, so just skip straight to sorting variables + */ + if (t->size) + goto sort_vars; + ret = bpf_object__section_size(obj, name, &size); if (ret || !size || (t->size && t->size != size)) { pr_debug("Invalid size for section %s: %u bytes\n", name, size); @@ -614,7 +620,8 @@ static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf, vsi->offset = off; } - qsort(t + 1, vars, sizeof(*vsi), compare_vsi_off); +sort_vars: + qsort(btf_var_secinfos(t), vars, sizeof(*vsi), compare_vsi_off); return 0; } diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f6c135869701..26144706ba0b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -44,6 +44,7 @@ #include #include #include +#include #include "libbpf.h" #include "bpf.h" @@ -139,6 +140,20 @@ struct bpf_capabilities { __u32 array_mmap:1; }; +enum reloc_type { + RELO_LD64, + RELO_CALL, + RELO_DATA, + RELO_EXTERN, +}; + +struct reloc_desc { + enum reloc_type type; + int insn_idx; + int map_idx; + int sym_off; +}; + /* * bpf_prog should be a better name but it has been used in * linux/filter.h. @@ -157,16 +172,7 @@ struct bpf_program { size_t insns_cnt, main_prog_cnt; enum bpf_prog_type type; - struct reloc_desc { - enum { - RELO_LD64, - RELO_CALL, - RELO_DATA, - } type; - int insn_idx; - int map_idx; - int sym_off; - } *reloc_desc; + struct reloc_desc *reloc_desc; int nr_reloc; int log_level; @@ -198,18 +204,21 @@ struct bpf_program { #define DATA_SEC ".data" #define BSS_SEC ".bss" #define RODATA_SEC ".rodata" +#define EXTERN_SEC ".extern" enum libbpf_map_type { LIBBPF_MAP_UNSPEC, LIBBPF_MAP_DATA, LIBBPF_MAP_BSS, LIBBPF_MAP_RODATA, + LIBBPF_MAP_EXTERN, }; static const char * const libbpf_type_to_btf_name[] = { [LIBBPF_MAP_DATA] = DATA_SEC, [LIBBPF_MAP_BSS] = BSS_SEC, [LIBBPF_MAP_RODATA] = RODATA_SEC, + [LIBBPF_MAP_EXTERN] = EXTERN_SEC, }; struct bpf_map { @@ -231,6 +240,28 @@ struct bpf_map { bool reused; }; +enum extern_type { + EXT_UNKNOWN, + EXT_CHAR, + EXT_BOOL, + EXT_INT, + EXT_TRISTATE, + EXT_CHAR_ARR, +}; + +struct extern_desc { + const char *name; + int sym_idx; + int btf_id; + enum extern_type type; + int sz; + int align; + int data_off; + bool is_signed; + bool is_weak; + bool is_set; +}; + static LIST_HEAD(bpf_objects_list); struct bpf_object { @@ -244,6 +275,11 @@ struct bpf_object { size_t nr_maps; size_t maps_cap; + char *kconfig_path; + struct extern_desc *externs; + int nr_extern; + int extern_map_idx; + bool loaded; bool has_pseudo_calls; bool relaxed_core_relocs; @@ -271,6 +307,7 @@ struct bpf_object { int maps_shndx; int btf_maps_shndx; int text_shndx; + int symbols_shndx; int data_shndx; int rodata_shndx; int bss_shndx; @@ -542,6 +579,7 @@ static struct bpf_object *bpf_object__new(const char *path, obj->efile.data_shndx = -1; obj->efile.rodata_shndx = -1; obj->efile.bss_shndx = -1; + obj->extern_map_idx = -1; obj->kern_version = get_kernel_version(); obj->loaded = false; @@ -866,7 +904,8 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, def->key_size = sizeof(int); def->value_size = data_sz; def->max_entries = 1; - def->map_flags = type == LIBBPF_MAP_RODATA ? BPF_F_RDONLY_PROG : 0; + def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_EXTERN + ? BPF_F_RDONLY_PROG : 0; def->map_flags |= BPF_F_MMAPABLE; pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", @@ -883,7 +922,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, return err; } - if (type != LIBBPF_MAP_BSS) + if (data) memcpy(map->mmaped, data, data_sz); pr_debug("map %td is \"%s\"\n", map - obj->maps, map->name); @@ -924,6 +963,271 @@ static int bpf_object__init_global_data_maps(struct bpf_object *obj) return 0; } + +static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, + const void *name) +{ + int i; + + for (i = 0; i < obj->nr_extern; i++) { + if (strcmp(obj->externs[i].name, name) == 0) + return &obj->externs[i]; + } + return NULL; +} + +static int set_ext_value_tri(struct extern_desc *ext, void *ext_val, + char value) +{ + switch (ext->type) { + case EXT_BOOL: + if (value == 'm') { + pr_warn("extern %s=%c should be tristate or char\n", + ext->name, value); + return -EINVAL; + } + *(bool *)ext_val = value == 'y' ? true : false; + break; + case EXT_TRISTATE: + if (value == 'y') + *(enum libbpf_tristate *)ext_val = TRI_YES; + else if (value == 'm') + *(enum libbpf_tristate *)ext_val = TRI_MODULE; + else /* value == 'n' */ + *(enum libbpf_tristate *)ext_val = TRI_NO; + break; + case EXT_CHAR: + *(char *)ext_val = value; + break; + case EXT_UNKNOWN: + case EXT_INT: + case EXT_CHAR_ARR: + default: + pr_warn("extern %s=%c should be bool, tristate, or char\n", + ext->name, value); + return -EINVAL; + } + ext->is_set = true; + return 0; +} + +static int set_ext_value_str(struct extern_desc *ext, char *ext_val, + const char *value) +{ + size_t len; + + if (ext->type != EXT_CHAR_ARR) { + pr_warn("extern %s=%s should char array\n", ext->name, value); + return -EINVAL; + } + + len = strlen(value); + if (value[len - 1] != '"') { + pr_warn("extern '%s': invalid string config '%s'\n", + ext->name, value); + return -EINVAL; + } + + /* strip quotes */ + len -= 2; + if (len >= ext->sz) { + pr_warn("extern '%s': long string config %s of (%zu bytes) truncated to %d bytes\n", + ext->name, value, len, ext->sz - 1); + len = ext->sz - 1; + } + memcpy(ext_val, value + 1, len); + ext_val[len] = '\0'; + ext->is_set = true; + return 0; +} + +static int parse_u64(const char *value, __u64 *res) +{ + char *value_end; + int err; + + errno = 0; + *res = strtoull(value, &value_end, 0); + if (errno) { + err = -errno; + pr_warn("failed to parse '%s' as integer: %d\n", value, err); + return err; + } + if (*value_end) { + pr_warn("failed to parse '%s' as integer completely\n", value); + return -EINVAL; + } + return 0; +} + +static bool is_ext_value_in_range(const struct extern_desc *ext, __u64 v) +{ + int bit_sz = ext->sz * 8; + + if (ext->sz == 8) + return true; + + /* Validate that value stored in u64 fits in integer of `ext->sz` + * bytes size without any loss of information. If the target integer + * is signed, we rely on the following limits of integer type of + * Y bits and subsequent transformation: + * + * -2^(Y-1) <= X <= 2^(Y-1) - 1 + * 0 <= X + 2^(Y-1) <= 2^Y - 1 + * 0 <= X + 2^(Y-1) < 2^Y + * + * For unsigned target integer, check that all the (64 - Y) bits are + * zero. + */ + if (ext->is_signed) + return v + (1ULL << (bit_sz - 1)) < (1ULL << bit_sz); + else + return (v >> bit_sz) == 0; +} + +static int set_ext_value_num(struct extern_desc *ext, void *ext_val, + __u64 value) +{ + if (ext->type != EXT_INT && ext->type != EXT_CHAR) { + pr_warn("extern %s=%llu should be integer\n", + ext->name, value); + return -EINVAL; + } + if (!is_ext_value_in_range(ext, value)) { + pr_warn("extern %s=%llu value doesn't fit in %d bytes\n", + ext->name, value, ext->sz); + return -ERANGE; + } + switch (ext->sz) { + case 1: *(__u8 *)ext_val = value; break; + case 2: *(__u16 *)ext_val = value; break; + case 4: *(__u32 *)ext_val = value; break; + case 8: *(__u64 *)ext_val = value; break; + default: + return -EINVAL; + } + ext->is_set = true; + return 0; +} + +static int bpf_object__read_kernel_config(struct bpf_object *obj, + const char *config_path, + void *data) +{ + char buf[PATH_MAX], *sep, *value; + struct extern_desc *ext; + int len, err = 0; + void *ext_val; + __u64 num; + gzFile file; + + if (config_path) { + file = gzopen(config_path, "r"); + } else { + struct utsname uts; + + uname(&uts); + len = snprintf(buf, PATH_MAX, "/boot/config-%s", uts.release); + if (len < 0) + return -EINVAL; + else if (len >= PATH_MAX) + return -ENAMETOOLONG; + /* gzopen also accepts uncompressed files. */ + file = gzopen(buf, "r"); + if (!file) + file = gzopen("/proc/config.gz", "r"); + } + if (!file) { + pr_warn("failed to read kernel config at '%s'\n", config_path); + return -ENOENT; + } + + while (gzgets(file, buf, sizeof(buf))) { + if (strncmp(buf, "CONFIG_", 7)) + continue; + + sep = strchr(buf, '='); + if (!sep) { + err = -EINVAL; + pr_warn("failed to parse '%s': no separator\n", buf); + goto out; + } + /* Trim ending '\n' */ + len = strlen(buf); + if (buf[len - 1] == '\n') + buf[len - 1] = '\0'; + /* Split on '=' and ensure that a value is present. */ + *sep = '\0'; + if (!sep[1]) { + err = -EINVAL; + *sep = '='; + pr_warn("failed to parse '%s': no value\n", buf); + goto out; + } + + ext = find_extern_by_name(obj, buf); + if (!ext) + continue; + if (ext->is_set) { + err = -EINVAL; + pr_warn("re-defining extern '%s' not allowed\n", buf); + goto out; + } + + ext_val = data + ext->data_off; + value = sep + 1; + + switch (*value) { + case 'y': case 'n': case 'm': + err = set_ext_value_tri(ext, ext_val, *value); + break; + case '"': + err = set_ext_value_str(ext, ext_val, value); + break; + default: + /* assume integer */ + err = parse_u64(value, &num); + if (err) { + pr_warn("extern %s=%s should be integer\n", + ext->name, value); + goto out; + } + err = set_ext_value_num(ext, ext_val, num); + break; + } + if (err) + goto out; + pr_debug("extern %s=%s\n", ext->name, value); + } + +out: + gzclose(file); + return err; +} + +static int bpf_object__init_extern_map(struct bpf_object *obj) +{ + struct extern_desc *last_ext; + size_t map_sz; + int err; + + if (obj->nr_extern == 0) + return 0; + + last_ext = &obj->externs[obj->nr_extern - 1]; + map_sz = last_ext->data_off + last_ext->sz; + + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_EXTERN, + obj->efile.symbols_shndx, + NULL, map_sz); + if (err) + return err; + + obj->extern_map_idx = obj->nr_maps - 1; + + return 0; +} + static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) { Elf_Data *symbols = obj->efile.symbols; @@ -1401,19 +1705,17 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, static int bpf_object__init_maps(struct bpf_object *obj, const struct bpf_object_open_opts *opts) { - const char *pin_root_path = OPTS_GET(opts, pin_root_path, NULL); - bool strict = !OPTS_GET(opts, relaxed_maps, false); + const char *pin_root_path; + bool strict; int err; + strict = !OPTS_GET(opts, relaxed_maps, false); + pin_root_path = OPTS_GET(opts, pin_root_path, NULL); + err = bpf_object__init_user_maps(obj, strict); - if (err) - return err; - - err = bpf_object__init_user_btf_maps(obj, strict, pin_root_path); - if (err) - return err; - - err = bpf_object__init_global_data_maps(obj); + err = err ?: bpf_object__init_user_btf_maps(obj, strict, pin_root_path); + err = err ?: bpf_object__init_global_data_maps(obj); + err = err ?: bpf_object__init_extern_map(obj); if (err) return err; @@ -1532,11 +1834,6 @@ static int bpf_object__init_btf(struct bpf_object *obj, BTF_ELF_SEC, err); goto out; } - err = btf__finalize_data(obj, obj->btf); - if (err) { - pr_warn("Error finalizing %s: %d.\n", BTF_ELF_SEC, err); - goto out; - } } if (btf_ext_data) { if (!obj->btf) { @@ -1570,6 +1867,30 @@ out: return 0; } +static int bpf_object__finalize_btf(struct bpf_object *obj) +{ + int err; + + if (!obj->btf) + return 0; + + err = btf__finalize_data(obj, obj->btf); + if (!err) + return 0; + + pr_warn("Error finalizing %s: %d.\n", BTF_ELF_SEC, err); + btf__free(obj->btf); + obj->btf = NULL; + btf_ext__free(obj->btf_ext); + obj->btf_ext = NULL; + + if (bpf_object__is_btf_mandatory(obj)) { + pr_warn("BTF is required, but is missing or corrupted.\n"); + return -ENOENT; + } + return 0; +} + static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) { int err = 0; @@ -1670,6 +1991,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) return -LIBBPF_ERRNO__FORMAT; } obj->efile.symbols = data; + obj->efile.symbols_shndx = idx; obj->efile.strtabidx = sh.sh_link; } else if (sh.sh_type == SHT_PROGBITS && data->d_size > 0) { if (sh.sh_flags & SHF_EXECINSTR) { @@ -1737,6 +2059,216 @@ static int bpf_object__elf_collect(struct bpf_object *obj) return bpf_object__init_btf(obj, btf_data, btf_ext_data); } +static bool sym_is_extern(const GElf_Sym *sym) +{ + int bind = GELF_ST_BIND(sym->st_info); + /* externs are symbols w/ type=NOTYPE, bind=GLOBAL|WEAK, section=UND */ + return sym->st_shndx == SHN_UNDEF && + (bind == STB_GLOBAL || bind == STB_WEAK) && + GELF_ST_TYPE(sym->st_info) == STT_NOTYPE; +} + +static int find_extern_btf_id(const struct btf *btf, const char *ext_name) +{ + const struct btf_type *t; + const char *var_name; + int i, n; + + if (!btf) + return -ESRCH; + + n = btf__get_nr_types(btf); + for (i = 1; i <= n; i++) { + t = btf__type_by_id(btf, i); + + if (!btf_is_var(t)) + continue; + + var_name = btf__name_by_offset(btf, t->name_off); + if (strcmp(var_name, ext_name)) + continue; + + if (btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + return -EINVAL; + + return i; + } + + return -ENOENT; +} + +static enum extern_type find_extern_type(const struct btf *btf, int id, + bool *is_signed) +{ + const struct btf_type *t; + const char *name; + + t = skip_mods_and_typedefs(btf, id, NULL); + name = btf__name_by_offset(btf, t->name_off); + + if (is_signed) + *is_signed = false; + switch (btf_kind(t)) { + case BTF_KIND_INT: { + int enc = btf_int_encoding(t); + + if (enc & BTF_INT_BOOL) + return t->size == 1 ? EXT_BOOL : EXT_UNKNOWN; + if (is_signed) + *is_signed = enc & BTF_INT_SIGNED; + if (t->size == 1) + return EXT_CHAR; + if (t->size < 1 || t->size > 8 || (t->size & (t->size - 1))) + return EXT_UNKNOWN; + return EXT_INT; + } + case BTF_KIND_ENUM: + if (t->size != 4) + return EXT_UNKNOWN; + if (strcmp(name, "libbpf_tristate")) + return EXT_UNKNOWN; + return EXT_TRISTATE; + case BTF_KIND_ARRAY: + if (btf_array(t)->nelems == 0) + return EXT_UNKNOWN; + if (find_extern_type(btf, btf_array(t)->type, NULL) != EXT_CHAR) + return EXT_UNKNOWN; + return EXT_CHAR_ARR; + default: + return EXT_UNKNOWN; + } +} + +static int cmp_externs(const void *_a, const void *_b) +{ + const struct extern_desc *a = _a; + const struct extern_desc *b = _b; + + /* descending order by alignment requirements */ + if (a->align != b->align) + return a->align > b->align ? -1 : 1; + /* ascending order by size, within same alignment class */ + if (a->sz != b->sz) + return a->sz < b->sz ? -1 : 1; + /* resolve ties by name */ + return strcmp(a->name, b->name); +} + +static int bpf_object__collect_externs(struct bpf_object *obj) +{ + const struct btf_type *t; + struct extern_desc *ext; + int i, n, off, btf_id; + struct btf_type *sec; + const char *ext_name; + Elf_Scn *scn; + GElf_Shdr sh; + + if (!obj->efile.symbols) + return 0; + + scn = elf_getscn(obj->efile.elf, obj->efile.symbols_shndx); + if (!scn) + return -LIBBPF_ERRNO__FORMAT; + if (gelf_getshdr(scn, &sh) != &sh) + return -LIBBPF_ERRNO__FORMAT; + n = sh.sh_size / sh.sh_entsize; + + pr_debug("looking for externs among %d symbols...\n", n); + for (i = 0; i < n; i++) { + GElf_Sym sym; + + if (!gelf_getsym(obj->efile.symbols, i, &sym)) + return -LIBBPF_ERRNO__FORMAT; + if (!sym_is_extern(&sym)) + continue; + ext_name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, + sym.st_name); + if (!ext_name || !ext_name[0]) + continue; + + ext = obj->externs; + ext = reallocarray(ext, obj->nr_extern + 1, sizeof(*ext)); + if (!ext) + return -ENOMEM; + obj->externs = ext; + ext = &ext[obj->nr_extern]; + memset(ext, 0, sizeof(*ext)); + obj->nr_extern++; + + ext->btf_id = find_extern_btf_id(obj->btf, ext_name); + if (ext->btf_id <= 0) { + pr_warn("failed to find BTF for extern '%s': %d\n", + ext_name, ext->btf_id); + return ext->btf_id; + } + t = btf__type_by_id(obj->btf, ext->btf_id); + ext->name = btf__name_by_offset(obj->btf, t->name_off); + ext->sym_idx = i; + ext->is_weak = GELF_ST_BIND(sym.st_info) == STB_WEAK; + ext->sz = btf__resolve_size(obj->btf, t->type); + if (ext->sz <= 0) { + pr_warn("failed to resolve size of extern '%s': %d\n", + ext_name, ext->sz); + return ext->sz; + } + ext->align = btf__align_of(obj->btf, t->type); + if (ext->align <= 0) { + pr_warn("failed to determine alignment of extern '%s': %d\n", + ext_name, ext->align); + return -EINVAL; + } + ext->type = find_extern_type(obj->btf, t->type, + &ext->is_signed); + if (ext->type == EXT_UNKNOWN) { + pr_warn("extern '%s' type is unsupported\n", ext_name); + return -ENOTSUP; + } + } + pr_debug("collected %d externs total\n", obj->nr_extern); + + if (!obj->nr_extern) + return 0; + + /* sort externs by (alignment, size, name) and calculate their offsets + * within a map */ + qsort(obj->externs, obj->nr_extern, sizeof(*ext), cmp_externs); + off = 0; + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + ext->data_off = roundup(off, ext->align); + off = ext->data_off + ext->sz; + pr_debug("extern #%d: symbol %d, off %u, name %s\n", + i, ext->sym_idx, ext->data_off, ext->name); + } + + btf_id = btf__find_by_name(obj->btf, EXTERN_SEC); + if (btf_id <= 0) { + pr_warn("no BTF info found for '%s' datasec\n", EXTERN_SEC); + return -ESRCH; + } + + sec = (struct btf_type *)btf__type_by_id(obj->btf, btf_id); + sec->size = off; + n = btf_vlen(sec); + for (i = 0; i < n; i++) { + struct btf_var_secinfo *vs = btf_var_secinfos(sec) + i; + + t = btf__type_by_id(obj->btf, vs->type); + ext_name = btf__name_by_offset(obj->btf, t->name_off); + ext = find_extern_by_name(obj, ext_name); + if (!ext) { + pr_warn("failed to find extern definition for BTF var '%s'\n", + ext_name); + return -ESRCH; + } + vs->offset = ext->data_off; + btf_var(t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + } + + return 0; +} + static struct bpf_program * bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx) { @@ -1801,6 +2333,8 @@ bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) return LIBBPF_MAP_BSS; else if (shndx == obj->efile.rodata_shndx) return LIBBPF_MAP_RODATA; + else if (shndx == obj->efile.symbols_shndx) + return LIBBPF_MAP_EXTERN; else return LIBBPF_MAP_UNSPEC; } @@ -1845,6 +2379,30 @@ static int bpf_program__record_reloc(struct bpf_program *prog, insn_idx, insn->code); return -LIBBPF_ERRNO__RELOC; } + + if (sym_is_extern(sym)) { + int sym_idx = GELF_R_SYM(rel->r_info); + int i, n = obj->nr_extern; + struct extern_desc *ext; + + for (i = 0; i < n; i++) { + ext = &obj->externs[i]; + if (ext->sym_idx == sym_idx) + break; + } + if (i >= n) { + pr_warn("extern relo failed to find extern for sym %d\n", + sym_idx); + return -LIBBPF_ERRNO__RELOC; + } + pr_debug("found extern #%d '%s' (sym %d, off %u) for insn %u\n", + i, ext->name, ext->sym_idx, ext->data_off, insn_idx); + reloc_desc->type = RELO_EXTERN; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = ext->data_off; + return 0; + } + if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { pr_warn("invalid relo for \'%s\' in special section 0x%x; forgot to initialize global var?..\n", name, shdr_idx); @@ -2306,11 +2864,12 @@ bpf_object__reuse_map(struct bpf_map *map) static int bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) { + enum libbpf_map_type map_type = map->libbpf_type; char *cp, errmsg[STRERR_BUFSIZE]; int err, zero = 0; - /* Nothing to do here since kernel already zero-initializes .bss map. */ - if (map->libbpf_type == LIBBPF_MAP_BSS) + /* kernel already zero-initializes .bss map. */ + if (map_type == LIBBPF_MAP_BSS) return 0; err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); @@ -2322,8 +2881,8 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) return err; } - /* Freeze .rodata map as read-only from syscall side. */ - if (map->libbpf_type == LIBBPF_MAP_RODATA) { + /* Freeze .rodata and .extern map as read-only from syscall side. */ + if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_EXTERN) { err = bpf_map_freeze(map->fd); if (err) { err = -errno; @@ -3572,9 +4131,6 @@ bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, size_t new_cnt; int err; - if (relo->type != RELO_CALL) - return -LIBBPF_ERRNO__RELOC; - if (prog->idx == obj->efile.text_shndx) { pr_warn("relo in .text insn %d into off %d (insn #%d)\n", relo->insn_idx, relo->sym_off, relo->sym_off / 8); @@ -3636,27 +4192,37 @@ bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) for (i = 0; i < prog->nr_reloc; i++) { struct reloc_desc *relo = &prog->reloc_desc[i]; + struct bpf_insn *insn = &prog->insns[relo->insn_idx]; - if (relo->type == RELO_LD64 || relo->type == RELO_DATA) { - struct bpf_insn *insn = &prog->insns[relo->insn_idx]; + if (relo->insn_idx + 1 >= (int)prog->insns_cnt) { + pr_warn("relocation out of range: '%s'\n", + prog->section_name); + return -LIBBPF_ERRNO__RELOC; + } - if (relo->insn_idx + 1 >= (int)prog->insns_cnt) { - pr_warn("relocation out of range: '%s'\n", - prog->section_name); - return -LIBBPF_ERRNO__RELOC; - } - - if (relo->type != RELO_DATA) { - insn[0].src_reg = BPF_PSEUDO_MAP_FD; - } else { - insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; - insn[1].imm = insn[0].imm + relo->sym_off; - } + switch (relo->type) { + case RELO_LD64: + insn[0].src_reg = BPF_PSEUDO_MAP_FD; insn[0].imm = obj->maps[relo->map_idx].fd; - } else if (relo->type == RELO_CALL) { + break; + case RELO_DATA: + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[1].imm = insn[0].imm + relo->sym_off; + insn[0].imm = obj->maps[relo->map_idx].fd; + break; + case RELO_EXTERN: + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[0].imm = obj->maps[obj->extern_map_idx].fd; + insn[1].imm = relo->sym_off; + break; + case RELO_CALL: err = bpf_program__reloc_text(prog, obj, relo); if (err) return err; + break; + default: + pr_warn("relo #%d: bad relo type %d\n", i, relo->type); + return -EINVAL; } } @@ -3936,11 +4502,10 @@ static struct bpf_object * __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) { + const char *obj_name, *kconfig_path; struct bpf_program *prog; struct bpf_object *obj; - const char *obj_name; char tmp_name[64]; - __u32 attach_prog_fd; int err; if (elf_version(EV_CURRENT) == EV_NONE) { @@ -3969,11 +4534,18 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, return obj; obj->relaxed_core_relocs = OPTS_GET(opts, relaxed_core_relocs, false); - attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); + kconfig_path = OPTS_GET(opts, kconfig_path, NULL); + if (kconfig_path) { + obj->kconfig_path = strdup(kconfig_path); + if (!obj->kconfig_path) + return ERR_PTR(-ENOMEM); + } err = bpf_object__elf_init(obj); err = err ? : bpf_object__check_endianness(obj); err = err ? : bpf_object__elf_collect(obj); + err = err ? : bpf_object__collect_externs(obj); + err = err ? : bpf_object__finalize_btf(obj); err = err ? : bpf_object__init_maps(obj, opts); err = err ? : bpf_object__init_prog_names(obj); err = err ? : bpf_object__collect_reloc(obj); @@ -3996,7 +4568,7 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, bpf_program__set_type(prog, prog_type); bpf_program__set_expected_attach_type(prog, attach_type); if (prog_type == BPF_PROG_TYPE_TRACING) - prog->attach_prog_fd = attach_prog_fd; + prog->attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); } return obj; @@ -4107,6 +4679,61 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) return 0; } +static int bpf_object__resolve_externs(struct bpf_object *obj, + const char *config_path) +{ + bool need_config = false; + struct extern_desc *ext; + int err, i; + void *data; + + if (obj->nr_extern == 0) + return 0; + + data = obj->maps[obj->extern_map_idx].mmaped; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + + if (strcmp(ext->name, "LINUX_KERNEL_VERSION") == 0) { + void *ext_val = data + ext->data_off; + __u32 kver = get_kernel_version(); + + if (!kver) { + pr_warn("failed to get kernel version\n"); + return -EINVAL; + } + err = set_ext_value_num(ext, ext_val, kver); + if (err) + return err; + pr_debug("extern %s=0x%x\n", ext->name, kver); + } else if (strncmp(ext->name, "CONFIG_", 7) == 0) { + need_config = true; + } else { + pr_warn("unrecognized extern '%s'\n", ext->name); + return -EINVAL; + } + } + if (need_config) { + err = bpf_object__read_kernel_config(obj, config_path, data); + if (err) + return -EINVAL; + } + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + + if (!ext->is_set && !ext->is_weak) { + pr_warn("extern %s (strong) not resolved\n", ext->name); + return -ESRCH; + } else if (!ext->is_set) { + pr_debug("extern %s (weak) not resolved, defaulting to zero\n", + ext->name); + } + } + + return 0; +} + int bpf_object__load_xattr(struct bpf_object_load_attr *attr) { struct bpf_object *obj; @@ -4126,6 +4753,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) obj->loaded = true; err = bpf_object__probe_caps(obj); + err = err ? : bpf_object__resolve_externs(obj, obj->kconfig_path); err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); err = err ? : bpf_object__create_maps(obj); @@ -4719,6 +5347,10 @@ void bpf_object__close(struct bpf_object *obj) zfree(&map->pin_path); } + zfree(&obj->kconfig_path); + zfree(&obj->externs); + obj->nr_extern = 0; + zfree(&obj->maps); obj->nr_maps = 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 623191e71415..6340823871e2 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -85,8 +85,12 @@ struct bpf_object_open_opts { */ const char *pin_root_path; __u32 attach_prog_fd; + /* kernel config file path override (for CONFIG_ externs); can point + * to either uncompressed text file or .gz file + */ + const char *kconfig_path; }; -#define bpf_object_open_opts__last_field attach_prog_fd +#define bpf_object_open_opts__last_field kconfig_path LIBBPF_API struct bpf_object *bpf_object__open(const char *path); LIBBPF_API struct bpf_object * @@ -669,6 +673,12 @@ LIBBPF_API int bpf_object__attach_skeleton(struct bpf_object_skeleton *s); LIBBPF_API void bpf_object__detach_skeleton(struct bpf_object_skeleton *s); LIBBPF_API void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s); +enum libbpf_tristate { + TRI_NO = 0, + TRI_YES = 1, + TRI_MODULE = 2, +}; + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f70c8e735120..ba90621617a8 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -24,7 +24,7 @@ CFLAGS += -g -Wall -O2 $(GENFLAGS) -I$(APIDIR) -I$(LIBDIR) -I$(BPFDIR) \ -I$(GENDIR) -I$(TOOLSINCDIR) -I$(CURDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program -LDLIBS += -lcap -lelf -lrt -lpthread +LDLIBS += -lcap -lelf -lz -lrt -lpthread # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ From 2ad97d473db57ab866f0756806bb94515f7f2551 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:47:09 -0800 Subject: [PATCH 052/127] bpftool: Generate externs datasec in BPF skeleton Add support for generation of mmap()-ed read-only view of libbpf-provided extern variables. As externs are not supposed to be provided by user code (that's what .data, .bss, and .rodata is for), don't mmap() it initially. Only after skeleton load is performed, map .extern contents as read-only memory. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191214014710.3449601-4-andriin@fb.com --- tools/bpf/bpftool/gen.c | 4 ++++ tools/lib/bpf/libbpf.c | 10 +++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 7379dae35dca..a07c80429c7a 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -82,6 +82,8 @@ static const char *get_map_ident(const struct bpf_map *map) return "rodata"; else if (str_has_suffix(name, ".bss")) return "bss"; + else if (str_has_suffix(name, ".extern")) + return "externs"; /* extern is a C keyword */ else return NULL; } @@ -109,6 +111,8 @@ static int codegen_datasec_def(struct bpf_object *obj, sec_ident = "bss"; else if (strcmp(sec_name, ".rodata") == 0) sec_ident = "rodata"; + else if (strcmp(sec_name, ".extern") == 0) + sec_ident = "externs"; /* extern is a C keyword */ else return 0; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 26144706ba0b..c9f90bfd2f0d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7472,7 +7472,8 @@ int bpf_object__open_skeleton(struct bpf_object_skeleton *s, return -ESRCH; } - if (mmaped) + /* externs shouldn't be pre-setup from user code */ + if (mmaped && (*map)->libbpf_type != LIBBPF_MAP_EXTERN) *mmaped = (*map)->mmaped; } @@ -7505,7 +7506,6 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) size_t mmap_sz = bpf_map_mmap_sz(map); int prot, map_fd = bpf_map__fd(map); void **mmaped = s->maps[i].mmaped; - void *remapped; if (!mmaped) continue; @@ -7530,9 +7530,9 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) * as per normal clean up procedure, so we don't need to worry * about it from skeleton's clean up perspective. */ - remapped = mmap(*mmaped, mmap_sz, prot, MAP_SHARED | MAP_FIXED, - map_fd, 0); - if (remapped == MAP_FAILED) { + *mmaped = mmap(map->mmaped, mmap_sz, prot, + MAP_SHARED | MAP_FIXED, map_fd, 0); + if (*mmaped == MAP_FAILED) { err = -errno; *mmaped = NULL; pr_warn("failed to re-mmap() map '%s': %d\n", From 330a73a7b6ca93a415de1b7da68d7a0698fe4937 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 13 Dec 2019 17:47:10 -0800 Subject: [PATCH 053/127] selftests/bpf: Add tests for libbpf-provided externs Add a set of tests validating libbpf-provided extern variables. One crucial feature that's tested is dead code elimination together with using invalid BPF helper. CONFIG_MISSING is not supposed to exist and should always be specified by libbpf as zero, which allows BPF verifier to correctly do branch pruning and not fail validation, when invalid BPF helper is called from dead if branch. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191214014710.3449601-5-andriin@fb.com --- .../selftests/bpf/prog_tests/core_extern.c | 195 ++++++++++++++++++ .../selftests/bpf/prog_tests/skeleton.c | 18 +- .../selftests/bpf/progs/test_core_extern.c | 62 ++++++ .../selftests/bpf/progs/test_skeleton.c | 9 + 4 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/core_extern.c create mode 100644 tools/testing/selftests/bpf/progs/test_core_extern.c diff --git a/tools/testing/selftests/bpf/prog_tests/core_extern.c b/tools/testing/selftests/bpf/prog_tests/core_extern.c new file mode 100644 index 000000000000..30a7972e9012 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/core_extern.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include +#include +#include +#include +#include "test_core_extern.skel.h" + +static uint32_t get_kernel_version(void) +{ + uint32_t major, minor, patch; + struct utsname info; + + uname(&info); + if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3) + return 0; + return KERNEL_VERSION(major, minor, patch); +} + +#define CFG "CONFIG_BPF_SYSCALL=n\n" + +static struct test_case { + const char *name; + const char *cfg; + const char *cfg_path; + bool fails; + struct test_core_extern__data data; +} test_cases[] = { + { .name = "default search path", .cfg_path = NULL, + .data = { .bpf_syscall = true } }, + { .name = "/proc/config.gz", .cfg_path = "/proc/config.gz", + .data = { .bpf_syscall = true } }, + { .name = "missing config", .fails = true, + .cfg_path = "/proc/invalid-config.gz" }, + { + .name = "custom values", + .cfg = "CONFIG_BPF_SYSCALL=y\n" + "CONFIG_TRISTATE=m\n" + "CONFIG_BOOL=y\n" + "CONFIG_CHAR=100\n" + "CONFIG_USHORT=30000\n" + "CONFIG_INT=123456\n" + "CONFIG_ULONG=0xDEADBEEFC0DE\n" + "CONFIG_STR=\"abracad\"\n" + "CONFIG_MISSING=0", + .data = { + .bpf_syscall = true, + .tristate_val = TRI_MODULE, + .bool_val = true, + .char_val = 100, + .ushort_val = 30000, + .int_val = 123456, + .ulong_val = 0xDEADBEEFC0DE, + .str_val = "abracad", + }, + }, + /* TRISTATE */ + { .name = "tristate (y)", .cfg = CFG"CONFIG_TRISTATE=y\n", + .data = { .tristate_val = TRI_YES } }, + { .name = "tristate (n)", .cfg = CFG"CONFIG_TRISTATE=n\n", + .data = { .tristate_val = TRI_NO } }, + { .name = "tristate (m)", .cfg = CFG"CONFIG_TRISTATE=m\n", + .data = { .tristate_val = TRI_MODULE } }, + { .name = "tristate (int)", .fails = 1, .cfg = CFG"CONFIG_TRISTATE=1" }, + { .name = "tristate (bad)", .fails = 1, .cfg = CFG"CONFIG_TRISTATE=M" }, + /* BOOL */ + { .name = "bool (y)", .cfg = CFG"CONFIG_BOOL=y\n", + .data = { .bool_val = true } }, + { .name = "bool (n)", .cfg = CFG"CONFIG_BOOL=n\n", + .data = { .bool_val = false } }, + { .name = "bool (tristate)", .fails = 1, .cfg = CFG"CONFIG_BOOL=m" }, + { .name = "bool (int)", .fails = 1, .cfg = CFG"CONFIG_BOOL=1" }, + /* CHAR */ + { .name = "char (tristate)", .cfg = CFG"CONFIG_CHAR=m\n", + .data = { .char_val = 'm' } }, + { .name = "char (bad)", .fails = 1, .cfg = CFG"CONFIG_CHAR=q\n" }, + { .name = "char (empty)", .fails = 1, .cfg = CFG"CONFIG_CHAR=\n" }, + { .name = "char (str)", .fails = 1, .cfg = CFG"CONFIG_CHAR=\"y\"\n" }, + /* STRING */ + { .name = "str (empty)", .cfg = CFG"CONFIG_STR=\"\"\n", + .data = { .str_val = "\0\0\0\0\0\0\0" } }, + { .name = "str (padded)", .cfg = CFG"CONFIG_STR=\"abra\"\n", + .data = { .str_val = "abra\0\0\0" } }, + { .name = "str (too long)", .cfg = CFG"CONFIG_STR=\"abracada\"\n", + .data = { .str_val = "abracad" } }, + { .name = "str (no value)", .fails = 1, .cfg = CFG"CONFIG_STR=\n" }, + { .name = "str (bad value)", .fails = 1, .cfg = CFG"CONFIG_STR=bla\n" }, + /* INTEGERS */ + { + .name = "integer forms", + .cfg = CFG + "CONFIG_CHAR=0xA\n" + "CONFIG_USHORT=0462\n" + "CONFIG_INT=-100\n" + "CONFIG_ULONG=+1000000000000", + .data = { + .char_val = 0xA, + .ushort_val = 0462, + .int_val = -100, + .ulong_val = 1000000000000, + }, + }, + { .name = "int (bad)", .fails = 1, .cfg = CFG"CONFIG_INT=abc" }, + { .name = "int (str)", .fails = 1, .cfg = CFG"CONFIG_INT=\"abc\"" }, + { .name = "int (empty)", .fails = 1, .cfg = CFG"CONFIG_INT=" }, + { .name = "int (mixed)", .fails = 1, .cfg = CFG"CONFIG_INT=123abc" }, + { .name = "int (max)", .cfg = CFG"CONFIG_INT=2147483647", + .data = { .int_val = 2147483647 } }, + { .name = "int (min)", .cfg = CFG"CONFIG_INT=-2147483648", + .data = { .int_val = -2147483648 } }, + { .name = "int (max+1)", .fails = 1, .cfg = CFG"CONFIG_INT=2147483648" }, + { .name = "int (min-1)", .fails = 1, .cfg = CFG"CONFIG_INT=-2147483649" }, + { .name = "ushort (max)", .cfg = CFG"CONFIG_USHORT=65535", + .data = { .ushort_val = 65535 } }, + { .name = "ushort (min)", .cfg = CFG"CONFIG_USHORT=0", + .data = { .ushort_val = 0 } }, + { .name = "ushort (max+1)", .fails = 1, .cfg = CFG"CONFIG_USHORT=65536" }, + { .name = "ushort (min-1)", .fails = 1, .cfg = CFG"CONFIG_USHORT=-1" }, + { .name = "u64 (max)", .cfg = CFG"CONFIG_ULONG=0xffffffffffffffff", + .data = { .ulong_val = 0xffffffffffffffff } }, + { .name = "u64 (min)", .cfg = CFG"CONFIG_ULONG=0", + .data = { .ulong_val = 0 } }, + { .name = "u64 (max+1)", .fails = 1, .cfg = CFG"CONFIG_ULONG=0x10000000000000000" }, +}; + +BPF_EMBED_OBJ(core_extern, "test_core_extern.o"); + +void test_core_extern(void) +{ + const uint32_t kern_ver = get_kernel_version(); + int err, duration = 0, i, j; + struct test_core_extern *skel = NULL; + uint64_t *got, *exp; + int n = sizeof(*skel->data) / sizeof(uint64_t); + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + char tmp_cfg_path[] = "/tmp/test_core_extern_cfg.XXXXXX"; + struct test_case *t = &test_cases[i]; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, + .kconfig_path = t->cfg_path, + ); + + if (!test__start_subtest(t->name)) + continue; + + if (t->cfg) { + size_t n = strlen(t->cfg) + 1; + int fd = mkstemp(tmp_cfg_path); + int written; + + if (CHECK(fd < 0, "mkstemp", "errno: %d\n", errno)) + continue; + printf("using '%s' as config file\n", tmp_cfg_path); + written = write(fd, t->cfg, n); + close(fd); + if (CHECK_FAIL(written != n)) + goto cleanup; + opts.kconfig_path = tmp_cfg_path; + } + + skel = test_core_extern__open_opts(&core_extern_embed, &opts); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) + goto cleanup; + err = test_core_extern__load(skel); + if (t->fails) { + CHECK(!err, "skel_load", + "shouldn't succeed open/load of skeleton\n"); + goto cleanup; + } else if (CHECK(err, "skel_load", + "failed to open/load skeleton\n")) { + goto cleanup; + } + err = test_core_extern__attach(skel); + if (CHECK(err, "attach_raw_tp", "failed attach: %d\n", err)) + goto cleanup; + + usleep(1); + + t->data.kern_ver = kern_ver; + t->data.missing_val = 0xDEADC0DE; + got = (uint64_t *)skel->data; + exp = (uint64_t *)&t->data; + for (j = 0; j < n; j++) { + CHECK(got[j] != exp[j], "check_res", + "result #%d: expected %lx, but got %lx\n", + j, exp[j], got[j]); + } +cleanup: + if (t->cfg) + unlink(tmp_cfg_path); + test_core_extern__destroy(skel); + skel = NULL; + } +} diff --git a/tools/testing/selftests/bpf/prog_tests/skeleton.c b/tools/testing/selftests/bpf/prog_tests/skeleton.c index 79f8d13e6740..151cdad3ad0d 100644 --- a/tools/testing/selftests/bpf/prog_tests/skeleton.c +++ b/tools/testing/selftests/bpf/prog_tests/skeleton.c @@ -17,11 +17,21 @@ void test_skeleton(void) int duration = 0, err; struct test_skeleton* skel; struct test_skeleton__bss *bss; + struct test_skeleton__externs *exts; - skel = test_skeleton__open_and_load(&skeleton_embed); + skel = test_skeleton__open(&skeleton_embed); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) return; + printf("EXTERNS BEFORE: %p\n", skel->externs); + if (CHECK(skel->externs, "skel_externs", "externs are mmaped()!\n")) + goto cleanup; + + err = test_skeleton__load(skel); + if (CHECK(err, "skel_load", "failed to load skeleton: %d\n", err)) + goto cleanup; + printf("EXTERNS AFTER: %p\n", skel->externs); + bss = skel->bss; bss->in1 = 1; bss->in2 = 2; @@ -29,6 +39,7 @@ void test_skeleton(void) bss->in4 = 4; bss->in5.a = 5; bss->in5.b = 6; + exts = skel->externs; err = test_skeleton__attach(skel); if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) @@ -46,6 +57,11 @@ void test_skeleton(void) CHECK(bss->handler_out5.b != 6, "res6", "got %lld != exp %d\n", bss->handler_out5.b, 6); + CHECK(bss->bpf_syscall != exts->CONFIG_BPF_SYSCALL, "ext1", + "got %d != exp %d\n", bss->bpf_syscall, exts->CONFIG_BPF_SYSCALL); + CHECK(bss->kern_ver != exts->LINUX_KERNEL_VERSION, "ext2", + "got %d != exp %d\n", bss->kern_ver, exts->LINUX_KERNEL_VERSION); + cleanup: test_skeleton__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_core_extern.c b/tools/testing/selftests/bpf/progs/test_core_extern.c new file mode 100644 index 000000000000..e12f09f9e881 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_core_extern.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include +#include +#include +#include +#include "bpf_helpers.h" + +/* non-existing BPF helper, to test dead code elimination */ +static int (*bpf_missing_helper)(const void *arg1, int arg2) = (void *) 999; + +extern int LINUX_KERNEL_VERSION; +extern bool CONFIG_BPF_SYSCALL; /* strong */ +extern enum libbpf_tristate CONFIG_TRISTATE __weak; +extern bool CONFIG_BOOL __weak; +extern char CONFIG_CHAR __weak; +extern uint16_t CONFIG_USHORT __weak; +extern int CONFIG_INT __weak; +extern uint64_t CONFIG_ULONG __weak; +extern const char CONFIG_STR[8] __weak; +extern uint64_t CONFIG_MISSING __weak; + +uint64_t kern_ver = -1; +uint64_t bpf_syscall = -1; +uint64_t tristate_val = -1; +uint64_t bool_val = -1; +uint64_t char_val = -1; +uint64_t ushort_val = -1; +uint64_t int_val = -1; +uint64_t ulong_val = -1; +char str_val[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; +uint64_t missing_val = -1; + +SEC("raw_tp/sys_enter") +int handle_sys_enter(struct pt_regs *ctx) +{ + int i; + + kern_ver = LINUX_KERNEL_VERSION; + bpf_syscall = CONFIG_BPF_SYSCALL; + tristate_val = CONFIG_TRISTATE; + bool_val = CONFIG_BOOL; + char_val = CONFIG_CHAR; + ushort_val = CONFIG_USHORT; + int_val = CONFIG_INT; + ulong_val = CONFIG_ULONG; + + for (i = 0; i < sizeof(CONFIG_STR); i++) { + str_val[i] = CONFIG_STR[i]; + } + + if (CONFIG_MISSING) + /* invalid, but dead code - never executed */ + missing_val = bpf_missing_helper(ctx, 123); + else + missing_val = 0xDEADC0DE; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_skeleton.c b/tools/testing/selftests/bpf/progs/test_skeleton.c index c0013d2e16f2..9caa44758ea2 100644 --- a/tools/testing/selftests/bpf/progs/test_skeleton.c +++ b/tools/testing/selftests/bpf/progs/test_skeleton.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ +#include #include #include "bpf_helpers.h" @@ -20,6 +21,10 @@ char out3 = 0; long long out4 = 0; int out1 = 0; +extern bool CONFIG_BPF_SYSCALL; +extern int LINUX_KERNEL_VERSION; +bool bpf_syscall = 0; +int kern_ver = 0; SEC("raw_tp/sys_enter") int handler(const void *ctx) @@ -31,6 +36,10 @@ int handler(const void *ctx) out3 = in3; out4 = in4; out5 = in5; + + bpf_syscall = CONFIG_BPF_SYSCALL; + kern_ver = LINUX_KERNEL_VERSION; + return 0; } From 1b484b301cec404e8a329f05edf848fc0a5875c0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 14 Dec 2019 23:08:43 -0800 Subject: [PATCH 054/127] libbpf: Support flexible arrays in CO-RE Some data stuctures in kernel are defined with either zero-sized array or flexible (dimensionless) array at the end of a struct. Actual data of such array follows in memory immediately after the end of that struct, forming its variable-sized "body" of elements. Support such access pattern in CO-RE relocation handling. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191215070844.1014385-2-andriin@fb.com --- tools/lib/bpf/libbpf.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c9f90bfd2f0d..a2cc7313763a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3108,6 +3108,21 @@ static bool str_is_empty(const char *s) return !s || !s[0]; } +static bool is_flex_arr(const struct btf *btf, + const struct bpf_core_accessor *acc, + const struct btf_array *arr) +{ + const struct btf_type *t; + + /* not a flexible array, if not inside a struct or has non-zero size */ + if (!acc->name || arr->nelems > 0) + return false; + + /* has to be the last member of enclosing struct */ + t = btf__type_by_id(btf, acc->type_id); + return acc->idx == btf_vlen(t) - 1; +} + /* * Turn bpf_field_reloc into a low- and high-level spec representation, * validating correctness along the way, as well as calculating resulting @@ -3145,6 +3160,7 @@ static int bpf_core_spec_parse(const struct btf *btf, struct bpf_core_spec *spec) { int access_idx, parsed_len, i; + struct bpf_core_accessor *acc; const struct btf_type *t; const char *name; __u32 id; @@ -3192,6 +3208,7 @@ static int bpf_core_spec_parse(const struct btf *btf, return -EINVAL; access_idx = spec->raw_spec[i]; + acc = &spec->spec[spec->len]; if (btf_is_composite(t)) { const struct btf_member *m; @@ -3209,18 +3226,23 @@ static int bpf_core_spec_parse(const struct btf *btf, if (str_is_empty(name)) return -EINVAL; - spec->spec[spec->len].type_id = id; - spec->spec[spec->len].idx = access_idx; - spec->spec[spec->len].name = name; + acc->type_id = id; + acc->idx = access_idx; + acc->name = name; spec->len++; } id = m->type; } else if (btf_is_array(t)) { const struct btf_array *a = btf_array(t); + bool flex; t = skip_mods_and_typedefs(btf, a->type, &id); - if (!t || access_idx >= a->nelems) + if (!t) + return -EINVAL; + + flex = is_flex_arr(btf, acc - 1, a); + if (!flex && access_idx >= a->nelems) return -EINVAL; spec->spec[spec->len].type_id = id; @@ -3525,12 +3547,14 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, */ if (i > 0) { const struct btf_array *a; + bool flex; if (!btf_is_array(targ_type)) return 0; a = btf_array(targ_type); - if (local_acc->idx >= a->nelems) + flex = is_flex_arr(targ_btf, targ_acc - 1, a); + if (!flex && local_acc->idx >= a->nelems) return 0; if (!skip_mods_and_typedefs(targ_btf, a->type, &targ_id)) From 5f2eeceffb92a0d799b141df7af3d1ac77337dc4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 14 Dec 2019 23:08:44 -0800 Subject: [PATCH 055/127] selftests/bpf: Add flexible array relocation tests Add few tests validation CO-RE relocation handling of flexible array accesses. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191215070844.1014385-3-andriin@fb.com --- .../selftests/bpf/prog_tests/core_reloc.c | 4 ++ ...f__core_reloc_arrays___equiv_zero_sz_arr.c | 3 ++ ..._core_reloc_arrays___err_bad_zero_sz_arr.c | 3 ++ .../btf__core_reloc_arrays___fixed_arr.c | 3 ++ .../selftests/bpf/progs/core_reloc_types.h | 39 +++++++++++++++++++ .../bpf/progs/test_core_reloc_arrays.c | 8 ++-- 6 files changed, 56 insertions(+), 4 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 05fe85281ff7..31e177adbdf1 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -74,6 +74,7 @@ .b123 = 2, \ .c1c = 3, \ .d00d = 4, \ + .f10c = 0, \ }, \ .output_len = sizeof(struct core_reloc_arrays_output) \ } @@ -308,12 +309,15 @@ static struct core_reloc_test_case test_cases[] = { ARRAYS_CASE(arrays), ARRAYS_CASE(arrays___diff_arr_dim), ARRAYS_CASE(arrays___diff_arr_val_sz), + ARRAYS_CASE(arrays___equiv_zero_sz_arr), + ARRAYS_CASE(arrays___fixed_arr), ARRAYS_ERR_CASE(arrays___err_too_small), ARRAYS_ERR_CASE(arrays___err_too_shallow), ARRAYS_ERR_CASE(arrays___err_non_array), ARRAYS_ERR_CASE(arrays___err_wrong_val_type1), ARRAYS_ERR_CASE(arrays___err_wrong_val_type2), + ARRAYS_ERR_CASE(arrays___err_bad_zero_sz_arr), /* enum/ptr/int handling scenarios */ PRIMITIVES_CASE(primitives), diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c new file mode 100644 index 000000000000..65eac371b061 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_arrays___equiv_zero_sz_arr x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c new file mode 100644 index 000000000000..ecda2b545ac2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_arrays___err_bad_zero_sz_arr x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c new file mode 100644 index 000000000000..fe1d01232c22 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_arrays___fixed_arr x) {} diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 9311489e14b2..6d598cfbdb3e 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -327,6 +327,7 @@ struct core_reloc_arrays_output { char b123; int c1c; int d00d; + int f10c; }; struct core_reloc_arrays_substruct { @@ -339,6 +340,7 @@ struct core_reloc_arrays { char b[2][3][4]; struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; }; /* bigger array dimensions */ @@ -347,6 +349,7 @@ struct core_reloc_arrays___diff_arr_dim { char b[3][4][5]; struct core_reloc_arrays_substruct c[4]; struct core_reloc_arrays_substruct d[2][3]; + struct core_reloc_arrays_substruct f[1][3]; }; /* different size of array's value (struct) */ @@ -363,6 +366,29 @@ struct core_reloc_arrays___diff_arr_val_sz { int d; int __padding2; } d[1][2]; + struct { + int __padding1; + int c; + int __padding2; + } f[][2]; +}; + +struct core_reloc_arrays___equiv_zero_sz_arr { + int a[5]; + char b[2][3][4]; + struct core_reloc_arrays_substruct c[3]; + struct core_reloc_arrays_substruct d[1][2]; + /* equivalent to flexible array */ + struct core_reloc_arrays_substruct f[0][2]; +}; + +struct core_reloc_arrays___fixed_arr { + int a[5]; + char b[2][3][4]; + struct core_reloc_arrays_substruct c[3]; + struct core_reloc_arrays_substruct d[1][2]; + /* not a flexible array anymore, but within access bounds */ + struct core_reloc_arrays_substruct f[1][2]; }; struct core_reloc_arrays___err_too_small { @@ -370,6 +396,7 @@ struct core_reloc_arrays___err_too_small { char b[2][3][4]; struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; }; struct core_reloc_arrays___err_too_shallow { @@ -377,6 +404,7 @@ struct core_reloc_arrays___err_too_shallow { char b[2][3]; /* this one lacks one dimension */ struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; }; struct core_reloc_arrays___err_non_array { @@ -384,6 +412,7 @@ struct core_reloc_arrays___err_non_array { char b[2][3][4]; struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; }; struct core_reloc_arrays___err_wrong_val_type { @@ -391,6 +420,16 @@ struct core_reloc_arrays___err_wrong_val_type { char b[2][3][4]; int c[3]; /* value is not a struct */ struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; +}; + +struct core_reloc_arrays___err_bad_zero_sz_arr { + /* zero-sized array, but not at the end */ + struct core_reloc_arrays_substruct f[0][2]; + int a[5]; + char b[2][3][4]; + struct core_reloc_arrays_substruct c[3]; + struct core_reloc_arrays_substruct d[1][2]; }; /* diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c index 89951b684282..053b86f6b53f 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c @@ -18,6 +18,7 @@ struct core_reloc_arrays_output { char b123; int c1c; int d00d; + int f01c; }; struct core_reloc_arrays_substruct { @@ -30,6 +31,7 @@ struct core_reloc_arrays { char b[2][3][4]; struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; }; #define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src) @@ -40,18 +42,16 @@ int test_core_arrays(void *ctx) struct core_reloc_arrays *in = (void *)&data.in; struct core_reloc_arrays_output *out = (void *)&data.out; - /* in->a[2] */ if (CORE_READ(&out->a2, &in->a[2])) return 1; - /* in->b[1][2][3] */ if (CORE_READ(&out->b123, &in->b[1][2][3])) return 1; - /* in->c[1].c */ if (CORE_READ(&out->c1c, &in->c[1].c)) return 1; - /* in->d[0][0].d */ if (CORE_READ(&out->d00d, &in->d[0][0].d)) return 1; + if (CORE_READ(&out->f01c, &in->f[0][1].c)) + return 1; return 0; } From a79ac2d1036a824abba982c33e938b717d1b659f Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Mon, 16 Dec 2019 17:27:38 +0900 Subject: [PATCH 056/127] libbpf: Fix build by renaming variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In btf__align_of() variable name 't' is shadowed by inner block declaration of another variable with same name. Patch renames variables in order to fix it. CC sharedobjs/btf.o btf.c: In function ‘btf__align_of’: btf.c:303:21: error: declaration of ‘t’ shadows a previous local [-Werror=shadow] 303 | int i, align = 1, t; | ^ btf.c:283:25: note: shadowed declaration is here 283 | const struct btf_type *t = btf__type_by_id(btf, id); | Fixes: 3d208f4ca111 ("libbpf: Expose btf__align_of() API") Signed-off-by: Prashant Bhole Signed-off-by: Daniel Borkmann Tested-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191216082738.28421-1-prashantbhole.linux@gmail.com --- tools/lib/bpf/btf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 520021939d81..5f04f56e1eb6 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -300,16 +300,16 @@ int btf__align_of(const struct btf *btf, __u32 id) case BTF_KIND_UNION: { const struct btf_member *m = btf_members(t); __u16 vlen = btf_vlen(t); - int i, align = 1, t; + int i, max_align = 1, align; for (i = 0; i < vlen; i++, m++) { - t = btf__align_of(btf, m->type); - if (t <= 0) - return t; - align = max(align, t); + align = btf__align_of(btf, m->type); + if (align <= 0) + return align; + max_align = max(max_align, align); } - return align; + return max_align; } default: pr_warn("unsupported BTF_KIND:%u\n", btf_kind(t)); From 159ecc002b5f55f9c58de533551a2ee552861185 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 16 Dec 2019 12:27:33 +0100 Subject: [PATCH 057/127] bpftool: Fix compilation warning on shadowed variable The ident variable has already been declared at the top of the function and doesn't need to be re-declared. Fixes: 985ead416df39 ("bpftool: Add skeleton codegen command") Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191216112733.GA28366@Omicron --- tools/bpf/bpftool/gen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index a07c80429c7a..f70088b4c19b 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -388,7 +388,7 @@ static int do_skeleton(int argc, char **argv) ); i = 0; bpf_object__for_each_map(map, obj) { - const char *ident = get_map_ident(map); + ident = get_map_ident(map); if (!ident) continue; From 5984dc6cb5aa6cce342a44f01f984fde09ed05b1 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Mon, 16 Dec 2019 16:16:19 +0900 Subject: [PATCH 058/127] samples/bpf: Reintroduce missed build targets Add xdp_redirect and per_socket_stats_example in build targets. They got removed from build targets in Makefile reorganization. Fixes: 1d97c6c2511f ("samples/bpf: Base target programs rules on Makefile.target") Signed-off-by: Prashant Bhole Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191216071619.25479-1-prashantbhole.linux@gmail.com --- samples/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 1fc42ad8ff49..8003d2823fa8 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -38,6 +38,8 @@ tprogs-y += tc_l2_redirect tprogs-y += lwt_len_hist tprogs-y += xdp_tx_iptunnel tprogs-y += test_map_in_map +tprogs-y += per_socket_stats_example +tprogs-y += xdp_redirect tprogs-y += xdp_redirect_map tprogs-y += xdp_redirect_cpu tprogs-y += xdp_monitor From 5615ed472dc04e38a1affca69059f17c1178f770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 16 Dec 2019 11:24:05 +0100 Subject: [PATCH 059/127] samples/bpf: Add missing -lz to TPROGS_LDLIBS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since libbpf now links against zlib, this needs to be included in the linker invocation for the userspace programs in samples/bpf that link statically against libbpf. Fixes: 166750bc1dd2 ("libbpf: Support libbpf-provided extern variables") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Tested-by: Björn Töpel Link: https://lore.kernel.org/bpf/20191216102405.353834-1-toke@redhat.com --- samples/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 8003d2823fa8..227077ae66ed 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -198,7 +198,7 @@ endif TPROGCFLAGS_bpf_load.o += -Wno-unused-variable -TPROGS_LDLIBS += $(LIBBPF) -lelf +TPROGS_LDLIBS += $(LIBBPF) -lelf -lz TPROGLDLIBS_tracex4 += -lrt TPROGLDLIBS_trace_output += -lrt TPROGLDLIBS_map_perf_test += -lrt From 450278977acbf494a20367c22fbb38729772d1fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 16 Dec 2019 11:38:19 +0100 Subject: [PATCH 060/127] samples/bpf: Set -fno-stack-protector when building BPF programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems Clang can in some cases turn on stack protection by default, which doesn't work with BPF. This was reported once before[0], but it seems the flag to explicitly turn off the stack protector wasn't added to the Makefile, so do that now. The symptom of this is compile errors like the following: error: :0:0: in function bpf_prog1 i32 (%struct.__sk_buff*): A call to built-in function '__stack_chk_fail' is not supported. [0] https://www.spinics.net/lists/netdev/msg556400.html Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191216103819.359535-1-toke@redhat.com --- samples/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 227077ae66ed..5b89c0370f33 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -236,6 +236,7 @@ BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \ readelf -S ./llvm_btf_verify.o | grep BTF; \ /bin/rm -f ./llvm_btf_verify.o) +BPF_EXTRA_CFLAGS += -fno-stack-protector ifneq ($(BTF_LLVM_PROBE),) BPF_EXTRA_CFLAGS += -g else From d50ecc46d18fa19ccf06e0c4d2ee8a050c665e3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 16 Dec 2019 12:07:42 +0100 Subject: [PATCH 061/127] samples/bpf: Attach XDP programs in driver mode by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When attaching XDP programs, userspace can set flags to request the attach mode (generic/SKB mode, driver mode or hw offloaded mode). If no such flags are requested, the kernel will attempt to attach in driver mode, and then silently fall back to SKB mode if this fails. The silent fallback is a major source of user confusion, as users will try to load a program on a device without XDP support, and instead of an error they will get the silent fallback behaviour, not notice, and then wonder why performance is not what they were expecting. In an attempt to combat this, let's switch all the samples to default to explicitly requesting driver-mode attach. As part of this, ensure that all the userspace utilities have a switch to enable SKB mode. For those that have a switch to request driver mode, keep it but turn it into a no-op. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Acked-by: David Ahern Link: https://lore.kernel.org/bpf/20191216110742.364456-1-toke@redhat.com --- samples/bpf/xdp1_user.c | 5 ++++- samples/bpf/xdp_adjust_tail_user.c | 5 ++++- samples/bpf/xdp_fwd_user.c | 17 ++++++++++++++--- samples/bpf/xdp_redirect_cpu_user.c | 4 ++++ samples/bpf/xdp_redirect_map_user.c | 5 ++++- samples/bpf/xdp_redirect_user.c | 5 ++++- samples/bpf/xdp_router_ipv4_user.c | 3 +++ samples/bpf/xdp_rxq_info_user.c | 4 ++++ samples/bpf/xdp_sample_pkts_user.c | 12 +++++++++--- samples/bpf/xdp_tx_iptunnel_user.c | 5 ++++- samples/bpf/xdpsock_user.c | 5 ++++- 11 files changed, 58 insertions(+), 12 deletions(-) diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index 3e553eed95a7..38a8852cb57f 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -98,7 +98,7 @@ int main(int argc, char **argv) xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -109,6 +109,9 @@ int main(int argc, char **argv) } } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + if (optind == argc) { usage(basename(argv[0])); return 1; diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c index d86e9ad0356b..008789eb6ada 100644 --- a/samples/bpf/xdp_adjust_tail_user.c +++ b/samples/bpf/xdp_adjust_tail_user.c @@ -120,7 +120,7 @@ int main(int argc, char **argv) xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -132,6 +132,9 @@ int main(int argc, char **argv) opt_flags[opt] = 0; } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + for (i = 0; i < strlen(optstr); i++) { if (opt_flags[(unsigned int)optstr[i]]) { fprintf(stderr, "Missing argument -%c\n", optstr[i]); diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c index 97ff1dad7669..c30f9acfdb84 100644 --- a/samples/bpf/xdp_fwd_user.c +++ b/samples/bpf/xdp_fwd_user.c @@ -27,11 +27,13 @@ #include "libbpf.h" #include +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + static int do_attach(int idx, int prog_fd, int map_fd, const char *name) { int err; - err = bpf_set_link_xdp_fd(idx, prog_fd, 0); + err = bpf_set_link_xdp_fd(idx, prog_fd, xdp_flags); if (err < 0) { printf("ERROR: failed to attach program to %s\n", name); return err; @@ -49,7 +51,7 @@ static int do_detach(int idx, const char *name) { int err; - err = bpf_set_link_xdp_fd(idx, -1, 0); + err = bpf_set_link_xdp_fd(idx, -1, xdp_flags); if (err < 0) printf("ERROR: failed to detach program from %s\n", name); @@ -83,11 +85,17 @@ int main(int argc, char **argv) int attach = 1; int ret = 0; - while ((opt = getopt(argc, argv, ":dD")) != -1) { + while ((opt = getopt(argc, argv, ":dDSF")) != -1) { switch (opt) { case 'd': attach = 0; break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; case 'D': prog_name = "xdp_fwd_direct"; break; @@ -97,6 +105,9 @@ int main(int argc, char **argv) } } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + if (optind == argc) { usage(basename(argv[0])); return 1; diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 0da6e9e7132e..72af628529b5 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -728,6 +728,10 @@ int main(int argc, char **argv) return EXIT_FAIL_OPTION; } } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + /* Required option */ if (ifindex == -1) { fprintf(stderr, "ERR: required option --dev missing\n"); diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index f70ee33907fd..cc840661faab 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -116,7 +116,7 @@ int main(int argc, char **argv) xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -127,6 +127,9 @@ int main(int argc, char **argv) } } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + if (optind == argc) { printf("usage: %s _IN _OUT\n", argv[0]); return 1; diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 5440cd620607..71dff8e3382a 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -117,7 +117,7 @@ int main(int argc, char **argv) xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -128,6 +128,9 @@ int main(int argc, char **argv) } } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + if (optind == argc) { printf("usage: %s _IN _OUT\n", argv[0]); return 1; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index 1469b66ebad1..fef286c5add2 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -662,6 +662,9 @@ int main(int ac, char **argv) } } + if (!(flags & XDP_FLAGS_SKB_MODE)) + flags |= XDP_FLAGS_DRV_MODE; + if (optind == ac) { usage(basename(argv[0])); return 1; diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index 8fc3ad01de72..fc4983fd6959 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -551,6 +551,10 @@ int main(int argc, char **argv) return EXIT_FAIL_OPTION; } } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + /* Required option */ if (ifindex == -1) { fprintf(stderr, "ERR: required option --dev missing\n"); diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c index a5760e8bf2c4..8c1af1b7372d 100644 --- a/samples/bpf/xdp_sample_pkts_user.c +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -52,13 +52,13 @@ static int do_detach(int idx, const char *name) __u32 curr_prog_id = 0; int err = 0; - err = bpf_get_link_xdp_id(idx, &curr_prog_id, 0); + err = bpf_get_link_xdp_id(idx, &curr_prog_id, xdp_flags); if (err) { printf("bpf_get_link_xdp_id failed\n"); return err; } if (prog_id == curr_prog_id) { - err = bpf_set_link_xdp_fd(idx, -1, 0); + err = bpf_set_link_xdp_fd(idx, -1, xdp_flags); if (err < 0) printf("ERROR: failed to detach prog from %s\n", name); } else if (!curr_prog_id) { @@ -115,7 +115,7 @@ int main(int argc, char **argv) .prog_type = BPF_PROG_TYPE_XDP, }; struct perf_buffer_opts pb_opts = {}; - const char *optstr = "F"; + const char *optstr = "FS"; int prog_fd, map_fd, opt; struct bpf_object *obj; struct bpf_map *map; @@ -127,12 +127,18 @@ int main(int argc, char **argv) case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; default: usage(basename(argv[0])); return 1; } } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + if (optind == argc) { usage(basename(argv[0])); return 1; diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index 2fe4c7f5ffe5..5f33b5530032 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -231,7 +231,7 @@ int main(int argc, char **argv) xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -243,6 +243,9 @@ int main(int argc, char **argv) opt_flags[opt] = 0; } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + for (i = 0; i < strlen(optstr); i++) { if (opt_flags[(unsigned int)optstr[i]]) { fprintf(stderr, "Missing argument -%c\n", optstr[i]); diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index a15480010828..e7829e5baaff 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -440,7 +440,7 @@ static void parse_command_line(int argc, char **argv) opt_xdp_bind_flags |= XDP_COPY; break; case 'N': - opt_xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'n': opt_interval = atoi(optarg); @@ -474,6 +474,9 @@ static void parse_command_line(int argc, char **argv) } } + if (!(opt_xdp_flags & XDP_FLAGS_SKB_MODE)) + opt_xdp_flags |= XDP_FLAGS_DRV_MODE; + opt_ifindex = if_nametoindex(opt_if); if (!opt_ifindex) { fprintf(stderr, "ERROR: interface \"%s\" does not exist\n", From dc3a2d254782dea8e23a55f46129105ecfe787fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 16 Dec 2019 19:12:04 +0100 Subject: [PATCH 062/127] libbpf: Print hint about ulimit when getting permission denied error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Probably the single most common error newcomers to XDP are stumped by is the 'permission denied' error they get when trying to load their program and 'ulimit -l' is set too low. For examples, see [0], [1]. Since the error code is UAPI, we can't change that. Instead, this patch adds a few heuristics in libbpf and outputs an additional hint if they are met: If an EPERM is returned on map create or program load, and geteuid() shows we are root, and the current RLIMIT_MEMLOCK is not infinity, we output a hint about raising 'ulimit -l' as an additional log line. [0] https://marc.info/?l=xdp-newbies&m=157043612505624&w=2 [1] https://github.com/xdp-project/xdp-tutorial/issues/86 Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191216181204.724953-1-toke@redhat.com --- tools/lib/bpf/libbpf.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a2cc7313763a..3fe42d6b0c2f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -100,6 +101,32 @@ void libbpf_print(enum libbpf_print_level level, const char *format, ...) va_end(args); } +static void pr_perm_msg(int err) +{ + struct rlimit limit; + char buf[100]; + + if (err != -EPERM || geteuid() != 0) + return; + + err = getrlimit(RLIMIT_MEMLOCK, &limit); + if (err) + return; + + if (limit.rlim_cur == RLIM_INFINITY) + return; + + if (limit.rlim_cur < 1024) + snprintf(buf, sizeof(buf), "%lu bytes", limit.rlim_cur); + else if (limit.rlim_cur < 1024*1024) + snprintf(buf, sizeof(buf), "%.1f KiB", (double)limit.rlim_cur / 1024); + else + snprintf(buf, sizeof(buf), "%.1f MiB", (double)limit.rlim_cur / (1024*1024)); + + pr_warn("permission error while running as root; try raising 'ulimit -l'? current value: %s\n", + buf); +} + #define STRERR_BUFSIZE 128 /* Copied from tools/perf/util/util.h */ @@ -2983,6 +3010,7 @@ err_out: cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); pr_warn("failed to create map (name: '%s'): %s(%d)\n", map->name, cp, err); + pr_perm_msg(err); for (j = 0; j < i; j++) zclose(obj->maps[j].fd); return err; @@ -4381,6 +4409,7 @@ retry_load: ret = -errno; cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warn("load bpf program failed: %s\n", cp); + pr_perm_msg(ret); if (log_buf && log_buf[0] != '\0') { ret = -LIBBPF_ERRNO__VERIFY; From dbd8f6bae6f4a98aa517aeda6e5276ed4a20f571 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 16 Dec 2019 10:38:29 -0800 Subject: [PATCH 063/127] libbpf: Add zlib as a dependency in pkg-config template List zlib as another dependency of libbpf in pkg-config template. Verified it is correctly resolved to proper -lz flag: $ make DESTDIR=/tmp/libbpf-install install $ pkg-config --libs /tmp/libbpf-install/usr/local/lib64/pkgconfig/libbpf.pc -L/usr/local/lib64 -lbpf $ pkg-config --libs --static /tmp/libbpf-install/usr/local/lib64/pkgconfig/libbpf.pc -L/usr/local/lib64 -lbpf -lelf -lz Fixes: 166750bc1dd2 ("libbpf: Support libbpf-provided extern variables") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Cc: Luca Boccassi Link: https://lore.kernel.org/bpf/20191216183830.3972964-1-andriin@fb.com --- tools/lib/bpf/libbpf.pc.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.pc.template b/tools/lib/bpf/libbpf.pc.template index ac17fcef2108..b45ed534bdfb 100644 --- a/tools/lib/bpf/libbpf.pc.template +++ b/tools/lib/bpf/libbpf.pc.template @@ -8,5 +8,5 @@ Name: libbpf Description: BPF library Version: @VERSION@ Libs: -L${libdir} -lbpf -Requires.private: libelf +Requires.private: libelf zlib Cflags: -I${includedir} From 92f7440ecc9331aa1e274273d4a3cb5c58b69706 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 16 Dec 2019 22:14:25 -0800 Subject: [PATCH 064/127] selftests/bpf: More succinct Makefile output Similarly to bpftool/libbpf output, make selftests/bpf output succinct per-item output line. Output is roughly as follows: $ make ... CLANG-LLC [test_maps] pyperf600.o CLANG-LLC [test_maps] strobemeta.o CLANG-LLC [test_maps] pyperf100.o EXTRA-OBJ [test_progs] cgroup_helpers.o EXTRA-OBJ [test_progs] trace_helpers.o BINARY test_align BINARY test_verifier_log GEN-SKEL [test_progs] fexit_bpf2bpf.skel.h GEN-SKEL [test_progs] test_global_data.skel.h GEN-SKEL [test_progs] sendmsg6_prog.skel.h ... To see the actual command invocation, verbose mode can be turned on with V=1 argument: $ make V=1 ... very verbose output ... Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191217061425.2346359-1-andriin@fb.com --- tools/testing/selftests/bpf/Makefile | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ba90621617a8..c652bd84ef0e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -77,6 +77,24 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ TEST_CUSTOM_PROGS = urandom_read +# Emit succinct information message describing current building step +# $1 - generic step name (e.g., CC, LINK, etc); +# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor]; +# $3 - target (assumed to be file); only file name will be emitted; +# $4 - optional extra arg, emitted as-is, if provided. +ifeq ($(V),1) +msg = +else +msg = @$(info $(1)$(if $(2), [$(2)]) $(notdir $(3)))$(if $(4), $(4)) +endif + +# override lib.mk's default rules +OVERRIDE_TARGETS := 1 +override define CLEAN + $(call msg, CLEAN) + $(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) +endef + include ../lib.mk # Define simple and short `make test_progs`, `make test_sysctl`, etc targets @@ -89,10 +107,16 @@ $(notdir $(TEST_GEN_PROGS) \ $(TEST_GEN_PROGS_EXTENDED) \ $(TEST_CUSTOM_PROGS)): %: $(OUTPUT)/% ; +$(OUTPUT)/%:%.c + $(call msg, BINARY,,$@) + $(LINK.c) $^ $(LDLIBS) -o $@ + $(OUTPUT)/urandom_read: urandom_read.c + $(call msg, BINARY,,$@) $(CC) -o $@ $< -Wl,--build-id $(OUTPUT)/test_stub.o: test_stub.c + $(call msg, CC,,$@) $(CC) -c $(CFLAGS) -o $@ $< BPFOBJ := $(OUTPUT)/libbpf.a @@ -167,24 +191,28 @@ $(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h # $3 - CFLAGS # $4 - LDFLAGS define CLANG_BPF_BUILD_RULE + $(call msg, CLANG-LLC,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -target bpf -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ $(LLC) -mattr=dwarfris -march=bpf -mcpu=probe $4 -filetype=obj -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE + $(call msg, CLANG-LLC,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -target bpf -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ $(LLC) -march=bpf -mcpu=v2 $4 -filetype=obj -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but using native Clang and bpf LLC define CLANG_NATIVE_BPF_BUILD_RULE + $(call msg, CLANG-BPF,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ $(LLC) -march=bpf -mcpu=probe $4 -filetype=obj -o $2 endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE + $(call msg, GCC-BPF,$(TRUNNER_BINARY),$2) $(BPF_GCC) $3 $4 -O2 -c $1 -o $2 endef @@ -243,6 +271,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \ $(TRUNNER_OUTPUT)/%.o \ | $(BPFTOOL) $(TRUNNER_OUTPUT) + $$(call msg, GEN-SKEL,$(TRUNNER_BINARY),$$@) $$(BPFTOOL) gen skeleton $$< > $$@ endif @@ -250,6 +279,7 @@ endif ifeq ($($(TRUNNER_TESTS_DIR)-tests-hdr),) $(TRUNNER_TESTS_DIR)-tests-hdr := y $(TRUNNER_TESTS_HDR): $(TRUNNER_TESTS_DIR)/*.c + $$(call msg, TEST-HDR,$(TRUNNER_BINARY),$$@) $$(shell ( cd $(TRUNNER_TESTS_DIR); \ echo '/* Generated header, do not edit */'; \ ls *.c 2> /dev/null | \ @@ -265,6 +295,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_BPF_OBJS) \ $(TRUNNER_BPF_SKELS) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) + $$(call msg, TEST-OBJ,$(TRUNNER_BINARY),$$@) cd $$(@D) && $$(CC) $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ @@ -272,17 +303,20 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_TESTS_HDR) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) + $$(call msg, EXTRA-OBJ,$(TRUNNER_BINARY),$$@) $$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ # only copy extra resources if in flavored build $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) ifneq ($2,) + $$(call msg, EXTRAS-CP,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES)) cp -a $$^ $(TRUNNER_OUTPUT)/ endif $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ | $(TRUNNER_BINARY)-extras + $$(call msg, BINARY,,$$@) $$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@ endef @@ -334,10 +368,12 @@ verifier/tests.h: verifier/*.c echo '#endif' \ ) > verifier/tests.h) $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) + $(call msg, BINARY,,$@) $(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ # Make sure we are able to include and link libbpf against c++. $(OUTPUT)/test_cpp: test_cpp.cpp $(BPFOBJ) + $(call msg, CXX,,$@) $(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) \ From fb9a98e160f10555936e1c7041d4efda4954d1df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 17 Dec 2019 12:28:10 +0100 Subject: [PATCH 065/127] libbpf: Fix libbpf_common.h when installing libbpf through 'make install' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes two issues with the newly introduced libbpf_common.h file: - The header failed to include for the definition of memset() - The new file was not included in the install_headers rule in the Makefile Both of these issues cause breakage when installing libbpf with 'make install' and trying to use it in applications. Fixes: 544402d4b493 ("libbpf: Extract common user-facing helpers") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191217112810.768078-1-toke@redhat.com --- tools/lib/bpf/Makefile | 1 + tools/lib/bpf/libbpf_common.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index a3718cb275f2..d4790121adf4 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -251,6 +251,7 @@ install_headers: bpf_helper_defs.h $(call do_install,libbpf.h,$(prefix)/include/bpf,644); \ $(call do_install,btf.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf_util.h,$(prefix)/include/bpf,644); \ + $(call do_install,libbpf_common.h,$(prefix)/include/bpf,644); \ $(call do_install,xsk.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_helpers.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_helper_defs.h,$(prefix)/include/bpf,644); \ diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h index 4fb833840961..a23ae1ac27eb 100644 --- a/tools/lib/bpf/libbpf_common.h +++ b/tools/lib/bpf/libbpf_common.h @@ -9,6 +9,8 @@ #ifndef __LIBBPF_LIBBPF_COMMON_H #define __LIBBPF_LIBBPF_COMMON_H +#include + #ifndef LIBBPF_API #define LIBBPF_API __attribute__((visibility("default"))) #endif From 4a3d6c6a6e4d652808729e7a2a8c3774a5a5c814 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 17 Dec 2019 15:42:28 -0800 Subject: [PATCH 066/127] libbpf: Reduce log level for custom section names Libbpf is trying to recognize BPF program type based on its section name during bpf_object__open() phase. This is not strictly enforced and user code has ability to specify/override correct BPF program type after open. But if BPF program is using custom section name, libbpf will still emit warnings, which can be quite annoying to users. This patch reduces log level of information messages emitted by libbpf if section name is not canonical. User can still get a list of all supported section names as debug-level message. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191217234228.1739308-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3fe42d6b0c2f..906bbbf7b2e4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5883,7 +5883,7 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, return 0; } - pr_warn("failed to guess program type from ELF section '%s'\n", name); + pr_debug("failed to guess program type from ELF section '%s'\n", name); type_names = libbpf_get_type_names(false); if (type_names != NULL) { pr_debug("supported section(type) names are:%s\n", type_names); @@ -6001,10 +6001,10 @@ int libbpf_attach_type_by_name(const char *name, *attach_type = section_defs[i].attach_type; return 0; } - pr_warn("failed to guess attach type based on ELF section name '%s'\n", name); + pr_debug("failed to guess attach type based on ELF section name '%s'\n", name); type_names = libbpf_get_type_names(true); if (type_names != NULL) { - pr_info("attachable section(type) names are:%s\n", type_names); + pr_debug("attachable section(type) names are:%s\n", type_names); free(type_names); } From 5dc7a8b2114479fd945956ece9875f747a996a8e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 17 Dec 2019 21:25:50 -0800 Subject: [PATCH 067/127] bpftool, selftests/bpf: Embed object file inside skeleton Embed contents of BPF object file used for BPF skeleton generation inside skeleton itself. This allows to keep BPF object file and its skeleton in sync at all times, and simpifies skeleton instantiation. Also switch existing selftests to not require BPF_EMBED_OBJ anymore. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191218052552.2915188-2-andriin@fb.com --- tools/bpf/bpftool/gen.c | 232 +++++++++++------- .../selftests/bpf/prog_tests/attach_probe.c | 4 +- .../selftests/bpf/prog_tests/core_extern.c | 4 +- .../selftests/bpf/prog_tests/fentry_fexit.c | 10 +- .../selftests/bpf/prog_tests/fentry_test.c | 7 +- tools/testing/selftests/bpf/prog_tests/mmap.c | 4 +- .../selftests/bpf/prog_tests/skeleton.c | 4 +- .../bpf/prog_tests/stacktrace_build_id.c | 4 +- .../bpf/prog_tests/stacktrace_build_id_nmi.c | 4 +- 9 files changed, 154 insertions(+), 119 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index f70088b4c19b..8d93c8f90f82 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "btf.h" @@ -261,14 +262,16 @@ static int codegen(const char *template, ...) static int do_skeleton(int argc, char **argv) { char header_guard[MAX_OBJ_NAME_LEN + sizeof("__SKEL_H__")]; - size_t i, map_cnt = 0, prog_cnt = 0; - char obj_name[MAX_OBJ_NAME_LEN]; + size_t i, map_cnt = 0, prog_cnt = 0, file_sz, mmap_sz; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + char obj_name[MAX_OBJ_NAME_LEN], *obj_data; + struct bpf_object *obj = NULL; const char *file, *ident; struct bpf_program *prog; - struct bpf_object *obj; + int fd, len, err = -1; struct bpf_map *map; struct btf *btf; - int err = -1; + struct stat st; if (!REQ_ARGS(1)) { usage(); @@ -281,14 +284,31 @@ static int do_skeleton(int argc, char **argv) return -1; } - obj = bpf_object__open_file(file, NULL); - if (IS_ERR(obj)) { - p_err("failed to open BPF object file: %ld", PTR_ERR(obj)); + if (stat(file, &st)) { + p_err("failed to stat() %s: %s", file, strerror(errno)); return -1; } - + file_sz = st.st_size; + mmap_sz = roundup(file_sz, sysconf(_SC_PAGE_SIZE)); + fd = open(file, O_RDONLY); + if (fd < 0) { + p_err("failed to open() %s: %s", file, strerror(errno)); + return -1; + } + obj_data = mmap(NULL, mmap_sz, PROT_READ, MAP_PRIVATE, fd, 0); + if (obj_data == MAP_FAILED) { + obj_data = NULL; + p_err("failed to mmap() %s: %s", file, strerror(errno)); + goto out; + } get_obj_name(obj_name, file); - get_header_guard(header_guard, obj_name); + opts.object_name = obj_name; + obj = bpf_object__open_mem(obj_data, file_sz, &opts); + if (IS_ERR(obj)) { + obj = NULL; + p_err("failed to open BPF object file: %ld", PTR_ERR(obj)); + goto out; + } bpf_object__for_each_map(map, obj) { ident = get_map_ident(map); @@ -303,8 +323,11 @@ static int do_skeleton(int argc, char **argv) prog_cnt++; } + get_header_guard(header_guard, obj_name); codegen("\ \n\ + /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ \n\ + \n\ /* THIS FILE IS AUTOGENERATED! */ \n\ #ifndef %2$s \n\ #define %2$s \n\ @@ -356,19 +379,95 @@ static int do_skeleton(int argc, char **argv) \n\ }; \n\ \n\ - static inline struct bpf_object_skeleton * \n\ - %1$s__create_skeleton(struct %1$s *obj, struct bpf_embed_data *embed)\n\ + static void \n\ + %1$s__destroy(struct %1$s *obj) \n\ + { \n\ + if (!obj) \n\ + return; \n\ + if (obj->skeleton) \n\ + bpf_object__destroy_skeleton(obj->skeleton);\n\ + free(obj); \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__create_skeleton(struct %1$s *obj); \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open_opts(const struct bpf_object_open_opts *opts) \n\ + { \n\ + struct %1$s *obj; \n\ + \n\ + obj = calloc(1, sizeof(*obj)); \n\ + if (!obj) \n\ + return NULL; \n\ + if (%1$s__create_skeleton(obj)) \n\ + goto err; \n\ + if (bpf_object__open_skeleton(obj->skeleton, opts)) \n\ + goto err; \n\ + \n\ + return obj; \n\ + err: \n\ + %1$s__destroy(obj); \n\ + return NULL; \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open(void) \n\ + { \n\ + return %1$s__open_opts(NULL); \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__load(struct %1$s *obj) \n\ + { \n\ + return bpf_object__load_skeleton(obj->skeleton); \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open_and_load(void) \n\ + { \n\ + struct %1$s *obj; \n\ + \n\ + obj = %1$s__open(); \n\ + if (!obj) \n\ + return NULL; \n\ + if (%1$s__load(obj)) { \n\ + %1$s__destroy(obj); \n\ + return NULL; \n\ + } \n\ + return obj; \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__attach(struct %1$s *obj) \n\ + { \n\ + return bpf_object__attach_skeleton(obj->skeleton); \n\ + } \n\ + \n\ + static inline void \n\ + %1$s__detach(struct %1$s *obj) \n\ + { \n\ + return bpf_object__detach_skeleton(obj->skeleton); \n\ + } \n\ + ", + obj_name + ); + + codegen("\ + \n\ + \n\ + static inline int \n\ + %1$s__create_skeleton(struct %1$s *obj) \n\ { \n\ struct bpf_object_skeleton *s; \n\ \n\ s = calloc(1, sizeof(*s)); \n\ if (!s) \n\ - return NULL; \n\ + return -1; \n\ + obj->skeleton = s; \n\ \n\ s->sz = sizeof(*s); \n\ s->name = \"%1$s\"; \n\ - s->data = embed->data; \n\ - s->data_sz = embed->size; \n\ s->obj = &obj->obj; \n\ ", obj_name @@ -438,90 +537,45 @@ static int do_skeleton(int argc, char **argv) codegen("\ \n\ \n\ - return s; \n\ + s->data_sz = %d; \n\ + s->data = \"\\ \n\ + ", + file_sz); + + /* embed contents of BPF object file */ + for (i = 0, len = 0; i < file_sz; i++) { + int w = obj_data[i] ? 4 : 2; + + len += w; + if (len > 78) { + printf("\\\n"); + len = w; + } + if (!obj_data[i]) + printf("\\0"); + else + printf("\\x%02x", (unsigned char)obj_data[i]); + } + + codegen("\ + \n\ + \"; \n\ + \n\ + return 0; \n\ err: \n\ bpf_object__destroy_skeleton(s); \n\ - return NULL; \n\ - } \n\ - \n\ - static void \n\ - %1$s__destroy(struct %1$s *obj) \n\ - { \n\ - if (!obj) \n\ - return; \n\ - if (obj->skeleton) \n\ - bpf_object__destroy_skeleton(obj->skeleton);\n\ - free(obj); \n\ - } \n\ - \n\ - static inline struct %1$s * \n\ - %1$s__open_opts(struct bpf_embed_data *embed, const struct bpf_object_open_opts *opts)\n\ - { \n\ - struct %1$s *obj; \n\ - \n\ - obj = calloc(1, sizeof(*obj)); \n\ - if (!obj) \n\ - return NULL; \n\ - \n\ - obj->skeleton = %1$s__create_skeleton(obj, embed); \n\ - if (!obj->skeleton) \n\ - goto err; \n\ - \n\ - if (bpf_object__open_skeleton(obj->skeleton, opts)) \n\ - goto err; \n\ - \n\ - return obj; \n\ - err: \n\ - %1$s__destroy(obj); \n\ - return NULL; \n\ - } \n\ - \n\ - static inline struct %1$s * \n\ - %1$s__open(struct bpf_embed_data *embed) \n\ - { \n\ - return %1$s__open_opts(embed, NULL); \n\ - } \n\ - \n\ - static inline int \n\ - %1$s__load(struct %1$s *obj) \n\ - { \n\ - return bpf_object__load_skeleton(obj->skeleton); \n\ - } \n\ - \n\ - static inline struct %1$s * \n\ - %1$s__open_and_load(struct bpf_embed_data *embed) \n\ - { \n\ - struct %1$s *obj; \n\ - \n\ - obj = %1$s__open(embed); \n\ - if (!obj) \n\ - return NULL; \n\ - if (%1$s__load(obj)) { \n\ - %1$s__destroy(obj); \n\ - return NULL; \n\ - } \n\ - return obj; \n\ - } \n\ - \n\ - static inline int \n\ - %1$s__attach(struct %1$s *obj) \n\ - { \n\ - return bpf_object__attach_skeleton(obj->skeleton); \n\ - } \n\ - \n\ - static inline void \n\ - %1$s__detach(struct %1$s *obj) \n\ - { \n\ - return bpf_object__detach_skeleton(obj->skeleton); \n\ + return -1; \n\ } \n\ \n\ #endif /* %2$s */ \n\ ", - obj_name, header_guard - ); + obj_name, header_guard); err = 0; out: bpf_object__close(obj); + if (obj_data) + munmap(obj_data, mmap_sz); + close(fd); return err; } diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 60da1d08daa0..5ed90ede2f1d 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -22,8 +22,6 @@ ssize_t get_base_addr() { return -EINVAL; } -BPF_EMBED_OBJ(probe, "test_attach_probe.o"); - void test_attach_probe(void) { int duration = 0; @@ -39,7 +37,7 @@ void test_attach_probe(void) return; uprobe_offset = (size_t)&get_base_addr - base_addr; - skel = test_attach_probe__open_and_load(&probe_embed); + skel = test_attach_probe__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) return; if (CHECK(!skel->bss, "check_bss", ".bss wasn't mmap()-ed\n")) diff --git a/tools/testing/selftests/bpf/prog_tests/core_extern.c b/tools/testing/selftests/bpf/prog_tests/core_extern.c index 30a7972e9012..5f03dc1de29e 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_extern.c +++ b/tools/testing/selftests/bpf/prog_tests/core_extern.c @@ -124,8 +124,6 @@ static struct test_case { { .name = "u64 (max+1)", .fails = 1, .cfg = CFG"CONFIG_ULONG=0x10000000000000000" }, }; -BPF_EMBED_OBJ(core_extern, "test_core_extern.o"); - void test_core_extern(void) { const uint32_t kern_ver = get_kernel_version(); @@ -159,7 +157,7 @@ void test_core_extern(void) opts.kconfig_path = tmp_cfg_path; } - skel = test_core_extern__open_opts(&core_extern_embed, &opts); + skel = test_core_extern__open_opts(&opts); if (CHECK(!skel, "skel_open", "skeleton open failed\n")) goto cleanup; err = test_core_extern__load(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 110fcf053fd0..235ac4f67f5b 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -5,10 +5,6 @@ #include "fentry_test.skel.h" #include "fexit_test.skel.h" -BPF_EMBED_OBJ(pkt_access, "test_pkt_access.o"); -BPF_EMBED_OBJ(fentry, "fentry_test.o"); -BPF_EMBED_OBJ(fexit, "fexit_test.o"); - void test_fentry_fexit(void) { struct test_pkt_access *pkt_skel = NULL; @@ -18,13 +14,13 @@ void test_fentry_fexit(void) __u32 duration = 0, retval; int err, pkt_fd, i; - pkt_skel = test_pkt_access__open_and_load(&pkt_access_embed); + pkt_skel = test_pkt_access__open_and_load(); if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) return; - fentry_skel = fentry_test__open_and_load(&fentry_embed); + fentry_skel = fentry_test__open_and_load(); if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto close_prog; - fexit_skel = fexit_test__open_and_load(&fexit_embed); + fexit_skel = fexit_test__open_and_load(); if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) goto close_prog; diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index 46a4afdf507a..e1a379f5f7d2 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -4,9 +4,6 @@ #include "test_pkt_access.skel.h" #include "fentry_test.skel.h" -BPF_EMBED_OBJ_DECLARE(pkt_access); -BPF_EMBED_OBJ_DECLARE(fentry); - void test_fentry_test(void) { struct test_pkt_access *pkt_skel = NULL; @@ -15,10 +12,10 @@ void test_fentry_test(void) __u32 duration, retval; __u64 *result; - pkt_skel = test_pkt_access__open_and_load(&pkt_access_embed); + pkt_skel = test_pkt_access__open_and_load(); if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) return; - fentry_skel = fentry_test__open_and_load(&fentry_embed); + fentry_skel = fentry_test__open_and_load(); if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 95a44d37ccea..16a814eb4d64 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -13,8 +13,6 @@ static size_t roundup_page(size_t sz) return (sz + page_size - 1) / page_size * page_size; } -BPF_EMBED_OBJ(test_mmap, "test_mmap.o"); - void test_mmap(void) { const size_t bss_sz = roundup_page(sizeof(struct test_mmap__bss)); @@ -30,7 +28,7 @@ void test_mmap(void) __u64 val = 0; - skel = test_mmap__open_and_load(&test_mmap_embed); + skel = test_mmap__open_and_load(); if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/skeleton.c b/tools/testing/selftests/bpf/prog_tests/skeleton.c index 151cdad3ad0d..ec6f2aec3853 100644 --- a/tools/testing/selftests/bpf/prog_tests/skeleton.c +++ b/tools/testing/selftests/bpf/prog_tests/skeleton.c @@ -10,8 +10,6 @@ struct s { #include "test_skeleton.skel.h" -BPF_EMBED_OBJ(skeleton, "test_skeleton.o"); - void test_skeleton(void) { int duration = 0, err; @@ -19,7 +17,7 @@ void test_skeleton(void) struct test_skeleton__bss *bss; struct test_skeleton__externs *exts; - skel = test_skeleton__open(&skeleton_embed); + skel = test_skeleton__open(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c index 4af8b8253f25..e8399ae50e77 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c @@ -2,8 +2,6 @@ #include #include "test_stacktrace_build_id.skel.h" -BPF_EMBED_OBJ(stacktrace_build_id, "test_stacktrace_build_id.o"); - void test_stacktrace_build_id(void) { @@ -18,7 +16,7 @@ void test_stacktrace_build_id(void) int retry = 1; retry: - skel = test_stacktrace_build_id__open_and_load(&stacktrace_build_id_embed); + skel = test_stacktrace_build_id__open_and_load(); if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 32fb03881a7b..8974450a4bdb 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -15,8 +15,6 @@ static __u64 read_perf_max_sample_freq(void) return sample_freq; } -BPF_EMBED_OBJ_DECLARE(stacktrace_build_id); - void test_stacktrace_build_id_nmi(void) { int control_map_fd, stackid_hmap_fd, stackmap_fd; @@ -37,7 +35,7 @@ void test_stacktrace_build_id_nmi(void) attr.sample_freq = read_perf_max_sample_freq(); retry: - skel = test_stacktrace_build_id__open(&stacktrace_build_id_embed); + skel = test_stacktrace_build_id__open(); if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; From 3bf3c79b772f4155312c000310abab700aba2200 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 17 Dec 2019 21:25:51 -0800 Subject: [PATCH 068/127] libbpf: Remove BPF_EMBED_OBJ macro from libbpf.h Drop BPF_EMBED_OBJ and struct bpf_embed_data now that skeleton automatically embeds contents of its source object file. While BPF_EMBED_OBJ is useful independently of skeleton, we are currently don't have any use cases utilizing it, so let's remove them until/if we need it. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191218052552.2915188-3-andriin@fb.com --- tools/lib/bpf/libbpf.h | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 6340823871e2..f7084235bae9 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -600,41 +600,6 @@ bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear); */ LIBBPF_API int libbpf_num_possible_cpus(void); -struct bpf_embed_data { - void *data; - size_t size; -}; - -#define BPF_EMBED_OBJ_DECLARE(NAME) \ -extern struct bpf_embed_data NAME##_embed; \ -extern char NAME##_data[]; \ -extern char NAME##_data_end[]; - -#define __BPF_EMBED_OBJ(NAME, PATH, SZ, ASM_TYPE) \ -asm ( \ -" .pushsection \".rodata\", \"a\", @progbits \n" \ -" .global "#NAME"_data \n" \ -#NAME"_data: \n" \ -" .incbin \"" PATH "\" \n" \ -" .global "#NAME"_data_end \n" \ -#NAME"_data_end: \n" \ -" .global "#NAME"_embed \n" \ -" .type "#NAME"_embed, @object \n" \ -" .size "#NAME"_size, "#SZ" \n" \ -" .align 8, \n" \ -#NAME"_embed: \n" \ -" "ASM_TYPE" "#NAME"_data \n" \ -" "ASM_TYPE" "#NAME"_data_end - "#NAME"_data \n" \ -" .popsection \n" \ -); \ -BPF_EMBED_OBJ_DECLARE(NAME) - -#if __SIZEOF_POINTER__ == 4 -#define BPF_EMBED_OBJ(NAME, PATH) __BPF_EMBED_OBJ(NAME, PATH, 8, ".long") -#else -#define BPF_EMBED_OBJ(NAME, PATH) __BPF_EMBED_OBJ(NAME, PATH, 16, ".quad") -#endif - struct bpf_map_skeleton { const char *name; struct bpf_map **map; From cb21ac58854605e6c15fd7da2c619823967b3140 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 17 Dec 2019 21:25:52 -0800 Subject: [PATCH 069/127] bpftool: Add gen subcommand manpage Add bpftool-gen.rst describing skeleton on the high level. Also include a small, but complete, example BPF app (BPF side, userspace side, generated skeleton) in example section to demonstrate skeleton API and its usage. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191218052552.2915188-4-andriin@fb.com --- .../bpf/bpftool/Documentation/bpftool-gen.rst | 304 ++++++++++++++++++ tools/bpf/bpftool/Documentation/bpftool.rst | 3 +- 2 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 tools/bpf/bpftool/Documentation/bpftool-gen.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst new file mode 100644 index 000000000000..b6a114bf908d --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -0,0 +1,304 @@ +================ +bpftool-gen +================ +------------------------------------------------------------------------------- +tool for BPF code-generation +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **gen** *COMMAND* + + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + + *COMMAND* := { **skeleton | **help** } + +GEN COMMANDS +============= + +| **bpftool** **gen skeleton** *FILE* +| **bpftool** **gen help** + +DESCRIPTION +=========== + **bpftool gen skeleton** *FILE* + Generate BPF skeleton C header file for a given *FILE*. + + BPF skeleton is an alternative interface to existing libbpf + APIs for working with BPF objects. Skeleton code is intended + to significantly shorten and simplify code to load and work + with BPF programs from userspace side. Generated code is + tailored to specific input BPF object *FILE*, reflecting its + structure by listing out available maps, program, variables, + etc. Skeleton eliminates the need to lookup mentioned + components by name. Instead, if skeleton instantiation + succeeds, they are populated in skeleton structure as valid + libbpf types (e.g., struct bpf_map pointer) and can be + passed to existing generic libbpf APIs. + + In addition to simple and reliable access to maps and + programs, skeleton provides a storage for BPF links (struct + bpf_link) for each BPF program within BPF object. When + requested, supported BPF programs will be automatically + attached and resulting BPF links stored for further use by + user in pre-allocated fields in skeleton struct. For BPF + programs that can't be automatically attached by libbpf, + user can attach them manually, but store resulting BPF link + in per-program link field. All such set up links will be + automatically destroyed on BPF skeleton destruction. This + eliminates the need for users to manage links manually and + rely on libbpf support to detach programs and free up + resources. + + Another facility provided by BPF skeleton is an interface to + global variables of all supported kinds: mutable, read-only, + as well as extern ones. This interface allows to pre-setup + initial values of variables before BPF object is loaded and + verified by kernel. For non-read-only variables, the same + interface can be used to fetch values of global variables on + userspace side, even if they are modified by BPF code. + + During skeleton generation, contents of source BPF object + *FILE* is embedded within generated code and is thus not + necessary to keep around. This ensures skeleton and BPF + object file are matching 1-to-1 and always stay in sync. + Generated code is dual-licensed under LGPL-2.1 and + BSD-2-Clause licenses. + + It is a design goal and guarantee that skeleton interfaces + are interoperable with generic libbpf APIs. User should + always be able to use skeleton API to create and load BPF + object, and later use libbpf APIs to keep working with + specific maps, programs, etc. + + As part of skeleton, few custom functions are generated. + Each of them is prefixed with object name, derived from + object file name. I.e., if BPF object file name is + **example.o**, BPF object name will be **example**. The + following custom functions are provided in such case: + + - **example__open** and **example__open_opts**. + These functions are used to instantiate skeleton. It + corresponds to libbpf's **bpf_object__open()** API. + **_opts** variants accepts extra **bpf_object_open_opts** + options. + + - **example__load**. + This function creates maps, loads and verifies BPF + programs, initializes global data maps. It corresponds to + libppf's **bpf_object__load** API. + + - **example__open_and_load** combines **example__open** and + **example__load** invocations in one commonly used + operation. + + - **example__attach** and **example__detach** + This pair of functions allow to attach and detach, + correspondingly, already loaded BPF object. Only BPF + programs of types supported by libbpf for auto-attachment + will be auto-attached and their corresponding BPF links + instantiated. For other BPF programs, user can manually + create a BPF link and assign it to corresponding fields in + skeleton struct. **example__detach** will detach both + links created automatically, as well as those populated by + user manually. + + - **example__destroy** + Detach and unload BPF programs, free up all the resources + used by skeleton and BPF object. + + If BPF object has global variables, corresponding structs + with memory layout corresponding to global data data section + layout will be created. Currently supported ones are: .data, + .bss, .rodata, and .extern structs/data sections. These + data sections/structs can be used to set up initial values of + variables, if set before **example__load**. Afterwards, if + target kernel supports memory-mapped BPF arrays, same + structs can be used to fetch and update (non-read-only) + data from userspace, with same simplicity as for BPF side. + + **bpftool gen help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -V, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, + this option has no effect. + + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + + -d, --debug + Print all logs available from libbpf, including debug-level + information. + +EXAMPLES +======== +**$ cat example.c** +:: + + #include + #include + #include + #include "bpf_helpers.h" + + const volatile int param1 = 42; + bool global_flag = true; + struct { int x; } data = {}; + + struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 128); + __type(key, int); + __type(value, long); + } my_map SEC(".maps"); + + SEC("raw_tp/sys_enter") + int handle_sys_enter(struct pt_regs *ctx) + { + static long my_static_var; + if (global_flag) + my_static_var++; + else + data.x += param1; + return 0; + } + + SEC("raw_tp/sys_exit") + int handle_sys_exit(struct pt_regs *ctx) + { + int zero = 0; + bpf_map_lookup_elem(&my_map, &zero); + return 0; + } + +This is example BPF application with two BPF programs and a mix of BPF maps +and global variables. + +**$ bpftool gen skeleton example.o** +:: + + /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + + /* THIS FILE IS AUTOGENERATED! */ + #ifndef __EXAMPLE_SKEL_H__ + #define __EXAMPLE_SKEL_H__ + + #include + #include + + struct example { + struct bpf_object_skeleton *skeleton; + struct bpf_object *obj; + struct { + struct bpf_map *rodata; + struct bpf_map *data; + struct bpf_map *bss; + struct bpf_map *my_map; + } maps; + struct { + struct bpf_program *handle_sys_enter; + struct bpf_program *handle_sys_exit; + } progs; + struct { + struct bpf_link *handle_sys_enter; + struct bpf_link *handle_sys_exit; + } links; + struct example__bss { + struct { + int x; + } data; + } *bss; + struct example__data { + _Bool global_flag; + long int handle_sys_enter_my_static_var; + } *data; + struct example__rodata { + int param1; + } *rodata; + }; + + static void example__destroy(struct example *obj); + static inline struct example *example__open_opts( + const struct bpf_object_open_opts *opts); + static inline struct example *example__open(); + static inline int example__load(struct example *obj); + static inline struct example *example__open_and_load(); + static inline int example__attach(struct example *obj); + static inline void example__detach(struct example *obj); + + #endif /* __EXAMPLE_SKEL_H__ */ + +**$ cat example_user.c** +:: + + #include "example.skel.h" + + int main() + { + struct example *skel; + int err = 0; + + skel = example__open(); + if (!skel) + goto cleanup; + + skel->rodata->param1 = 128; + + err = example__load(skel); + if (err) + goto cleanup; + + err = example__attach(skel); + if (err) + goto cleanup; + + /* all libbpf APIs are usable */ + printf("my_map name: %s\n", bpf_map__name(skel->maps.my_map)); + printf("sys_enter prog FD: %d\n", + bpf_program__fd(skel->progs.handle_sys_enter)); + + /* detach and re-attach sys_exit program */ + bpf_link__destroy(skel->links.handle_sys_exit); + skel->links.handle_sys_exit = + bpf_program__attach(skel->progs.handle_sys_exit); + + printf("my_static_var: %ld\n", + skel->bss->handle_sys_enter_my_static_var); + + cleanup: + example__destroy(skel); + return err; + } + +**# ./example_user** +:: + + my_map name: my_map + sys_enter prog FD: 8 + my_static_var: 7 + +This is a stripped-out version of skeleton generated for above example code. + +SEE ALSO +======== + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-map**\ (8), + **bpftool-prog**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 6a9c52ef84a9..34239fda69ed 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -81,4 +81,5 @@ SEE ALSO **bpftool-feature**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8) + **bpftool-btf**\ (8), + **bpftool-gen**\ (8), From 7c43e0d6a526e7734eb854fe242886f52ccd06ac Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 18 Dec 2019 13:43:14 -0800 Subject: [PATCH 070/127] bpftool: Simplify format string to not use positional args Change format string referring to just single argument out of two available. Some versions of libc can reject such format string. Reported-by: Nikita Shirokov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191218214314.2403729-1-andriin@fb.com --- tools/bpf/bpftool/gen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 8d93c8f90f82..851c465f99dc 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -567,9 +567,9 @@ static int do_skeleton(int argc, char **argv) return -1; \n\ } \n\ \n\ - #endif /* %2$s */ \n\ + #endif /* %s */ \n\ ", - obj_name, header_guard); + header_guard); err = 0; out: bpf_object__close(obj); From dacce6412e09b5dce19514e2c4e2a8aab0eb217f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 18 Dec 2019 14:17:07 -0800 Subject: [PATCH 071/127] bpftool: Work-around rst2man conversion bug Work-around what appears to be a bug in rst2man convertion tool, used to create man pages out of reStructureText-formatted documents. If text line starts with dot, rst2man will put it in resulting man file verbatim. This seems to cause man tool to interpret it as a directive/command (e.g., `.bs`), and subsequently not render entire line because it's unrecognized one. Enclose '.xxx' words in extra formatting to work around. Fixes: cb21ac588546 ("bpftool: Add gen subcommand manpage") Reported-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Date: Wed, 18 Dec 2019 12:57:47 -0800 Subject: [PATCH 072/127] bpf: Allow to change skb mark in test_run allow to pass skb's mark field into bpf_prog_test_run ctx for BPF_PROG_TYPE_SCHED_CLS prog type. that would allow to test bpf programs which are doing decision based on this field Signed-off-by: Nikita V. Shirokov Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 10 +++++++++- tools/testing/selftests/bpf/prog_tests/skb_ctx.c | 5 +++++ tools/testing/selftests/bpf/progs/test_skb_ctx.c | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 93a9c87787e0..d555c0d8657d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -251,7 +251,13 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) return 0; /* make sure the fields we don't use are zeroed */ - if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark))) + return -EINVAL; + + /* mark is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark), + offsetof(struct __sk_buff, priority))) return -EINVAL; /* priority is allowed */ @@ -274,6 +280,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) sizeof(struct __sk_buff))) return -EINVAL; + skb->mark = __skb->mark; skb->priority = __skb->priority; skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); @@ -301,6 +308,7 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) if (!__skb) return; + __skb->mark = skb->mark; __skb->priority = skb->priority; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c index edf5e8c7d400..c6d6b685a946 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c @@ -13,6 +13,7 @@ void test_skb_ctx(void) .tstamp = 7, .wire_len = 100, .gso_segs = 8, + .mark = 9, }; struct bpf_prog_test_run_attr tattr = { .data_in = &pkt_v4, @@ -93,4 +94,8 @@ void test_skb_ctx(void) "ctx_out_tstamp", "skb->tstamp == %lld, expected %d\n", skb.tstamp, 8); + CHECK_ATTR(skb.mark != 10, + "ctx_out_mark", + "skb->mark == %u, expected %d\n", + skb.mark, 10); } diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c index 534fbf9a7344..e18da87fe84f 100644 --- a/tools/testing/selftests/bpf/progs/test_skb_ctx.c +++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c @@ -17,6 +17,7 @@ int process(struct __sk_buff *skb) } skb->priority++; skb->tstamp++; + skb->mark++; if (skb->wire_len != 100) return 1; From d69587062c347314a019cf6ee27f2e4b494868e1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 18 Dec 2019 14:50:39 -0800 Subject: [PATCH 073/127] libbpf: Add bpf_link__disconnect() API to preserve underlying BPF resource There are cases in which BPF resource (program, map, etc) has to outlive userspace program that "installed" it in the system in the first place. When BPF program is attached, libbpf returns bpf_link object, which is supposed to be destroyed after no longer necessary through bpf_link__destroy() API. Currently, bpf_link destruction causes both automatic detachment and frees up any resources allocated to for bpf_link in-memory representation. This is inconvenient for the case described above because of coupling of detachment and resource freeing. This patch introduces bpf_link__disconnect() API call, which marks bpf_link as disconnected from its underlying BPF resouces. This means that when bpf_link is destroyed later, all its memory resources will be freed, but BPF resource itself won't be detached. This design allows to follow strict and resource-leak-free design by default, while giving easy and straightforward way for user code to opt for keeping BPF resource attached beyond lifetime of a bpf_link. For some BPF programs (i.e., FS-based tracepoints, kprobes, raw tracepoint, etc), user has to make sure to pin BPF program to prevent kernel to automatically detach it on process exit. This should typically be achived by pinning BPF program (or map in some cases) in BPF FS. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191218225039.2668205-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 40 ++++++++++++++++++++++++++++++---------- tools/lib/bpf/libbpf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 906bbbf7b2e4..2a341a5b9f63 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6245,17 +6245,37 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, } struct bpf_link { + int (*detach)(struct bpf_link *link); int (*destroy)(struct bpf_link *link); + bool disconnected; }; +/* Release "ownership" of underlying BPF resource (typically, BPF program + * attached to some BPF hook, e.g., tracepoint, kprobe, etc). Disconnected + * link, when destructed through bpf_link__destroy() call won't attempt to + * detach/unregisted that BPF resource. This is useful in situations where, + * say, attached BPF program has to outlive userspace program that attached it + * in the system. Depending on type of BPF program, though, there might be + * additional steps (like pinning BPF program in BPF FS) necessary to ensure + * exit of userspace program doesn't trigger automatic detachment and clean up + * inside the kernel. + */ +void bpf_link__disconnect(struct bpf_link *link) +{ + link->disconnected = true; +} + int bpf_link__destroy(struct bpf_link *link) { - int err; + int err = 0; if (!link) return 0; - err = link->destroy(link); + if (!link->disconnected && link->detach) + err = link->detach(link); + if (link->destroy) + link->destroy(link); free(link); return err; @@ -6266,7 +6286,7 @@ struct bpf_link_fd { int fd; /* hook FD */ }; -static int bpf_link__destroy_perf_event(struct bpf_link *link) +static int bpf_link__detach_perf_event(struct bpf_link *link) { struct bpf_link_fd *l = (void *)link; int err; @@ -6298,10 +6318,10 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, return ERR_PTR(-EINVAL); } - link = malloc(sizeof(*link)); + link = calloc(1, sizeof(*link)); if (!link) return ERR_PTR(-ENOMEM); - link->link.destroy = &bpf_link__destroy_perf_event; + link->link.detach = &bpf_link__detach_perf_event; link->fd = pfd; if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) { @@ -6608,7 +6628,7 @@ out: return link; } -static int bpf_link__destroy_fd(struct bpf_link *link) +static int bpf_link__detach_fd(struct bpf_link *link) { struct bpf_link_fd *l = (void *)link; @@ -6629,10 +6649,10 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, return ERR_PTR(-EINVAL); } - link = malloc(sizeof(*link)); + link = calloc(1, sizeof(*link)); if (!link) return ERR_PTR(-ENOMEM); - link->link.destroy = &bpf_link__destroy_fd; + link->link.detach = &bpf_link__detach_fd; pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); if (pfd < 0) { @@ -6668,10 +6688,10 @@ struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) return ERR_PTR(-EINVAL); } - link = malloc(sizeof(*link)); + link = calloc(1, sizeof(*link)); if (!link) return ERR_PTR(-ENOMEM); - link->link.destroy = &bpf_link__destroy_fd; + link->link.detach = &bpf_link__detach_fd; pfd = bpf_raw_tracepoint_open(NULL, prog_fd); if (pfd < 0) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f7084235bae9..ad8c1c127933 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -215,6 +215,7 @@ LIBBPF_API void bpf_program__unload(struct bpf_program *prog); struct bpf_link; +LIBBPF_API void bpf_link__disconnect(struct bpf_link *link); LIBBPF_API int bpf_link__destroy(struct bpf_link *link); LIBBPF_API struct bpf_link * diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 1376d992a703..e3a471f38a71 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -212,6 +212,7 @@ LIBBPF_0.0.6 { LIBBPF_0.0.7 { global: btf_dump__emit_type_decl; + bpf_link__disconnect; bpf_object__find_program_by_name; bpf_object__attach_skeleton; bpf_object__destroy_skeleton; From 81bfdd087bf31a87c5ff25cc7004d5308954a35c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 18 Dec 2019 16:28:34 -0800 Subject: [PATCH 074/127] libbpf: Put Kconfig externs into .kconfig section Move Kconfig-provided externs into custom .kconfig section. Add __kconfig into bpf_helpers.h for user convenience. Update selftests accordingly. Suggested-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191219002837.3074619-2-andriin@fb.com --- tools/bpf/bpftool/gen.c | 8 +-- tools/lib/bpf/bpf_helpers.h | 2 + tools/lib/bpf/libbpf.c | 58 +++++++++++-------- .../selftests/bpf/prog_tests/skeleton.c | 16 +++-- .../selftests/bpf/progs/test_core_extern.c | 20 +++---- .../selftests/bpf/progs/test_skeleton.c | 4 +- 6 files changed, 60 insertions(+), 48 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 851c465f99dc..a14d8bc5d31d 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -83,8 +83,8 @@ static const char *get_map_ident(const struct bpf_map *map) return "rodata"; else if (str_has_suffix(name, ".bss")) return "bss"; - else if (str_has_suffix(name, ".extern")) - return "externs"; /* extern is a C keyword */ + else if (str_has_suffix(name, ".kconfig")) + return "kconfig"; else return NULL; } @@ -112,8 +112,8 @@ static int codegen_datasec_def(struct bpf_object *obj, sec_ident = "bss"; else if (strcmp(sec_name, ".rodata") == 0) sec_ident = "rodata"; - else if (strcmp(sec_name, ".extern") == 0) - sec_ident = "externs"; /* extern is a C keyword */ + else if (strcmp(sec_name, ".kconfig") == 0) + sec_ident = "kconfig"; else return 0; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index aa46700075e1..f69cc208778a 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -53,4 +53,6 @@ enum libbpf_tristate { TRI_MODULE = 2, }; +#define __kconfig __attribute__((section(".kconfig"))) + #endif diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2a341a5b9f63..0772eb3254e2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -231,21 +231,21 @@ struct bpf_program { #define DATA_SEC ".data" #define BSS_SEC ".bss" #define RODATA_SEC ".rodata" -#define EXTERN_SEC ".extern" +#define KCONFIG_SEC ".kconfig" enum libbpf_map_type { LIBBPF_MAP_UNSPEC, LIBBPF_MAP_DATA, LIBBPF_MAP_BSS, LIBBPF_MAP_RODATA, - LIBBPF_MAP_EXTERN, + LIBBPF_MAP_KCONFIG, }; static const char * const libbpf_type_to_btf_name[] = { [LIBBPF_MAP_DATA] = DATA_SEC, [LIBBPF_MAP_BSS] = BSS_SEC, [LIBBPF_MAP_RODATA] = RODATA_SEC, - [LIBBPF_MAP_EXTERN] = EXTERN_SEC, + [LIBBPF_MAP_KCONFIG] = KCONFIG_SEC, }; struct bpf_map { @@ -305,7 +305,7 @@ struct bpf_object { char *kconfig_path; struct extern_desc *externs; int nr_extern; - int extern_map_idx; + int kconfig_map_idx; bool loaded; bool has_pseudo_calls; @@ -606,7 +606,7 @@ static struct bpf_object *bpf_object__new(const char *path, obj->efile.data_shndx = -1; obj->efile.rodata_shndx = -1; obj->efile.bss_shndx = -1; - obj->extern_map_idx = -1; + obj->kconfig_map_idx = -1; obj->kern_version = get_kernel_version(); obj->loaded = false; @@ -902,11 +902,25 @@ static size_t bpf_map_mmap_sz(const struct bpf_map *map) return map_sz; } +static char *internal_map_name(struct bpf_object *obj, + enum libbpf_map_type type) +{ + char map_name[BPF_OBJ_NAME_LEN]; + const char *sfx = libbpf_type_to_btf_name[type]; + int sfx_len = max((size_t)7, strlen(sfx)); + int pfx_len = min((size_t)BPF_OBJ_NAME_LEN - sfx_len - 1, + strlen(obj->name)); + + snprintf(map_name, sizeof(map_name), "%.*s%.*s", pfx_len, obj->name, + sfx_len, libbpf_type_to_btf_name[type]); + + return strdup(map_name); +} + static int bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, int sec_idx, void *data, size_t data_sz) { - char map_name[BPF_OBJ_NAME_LEN]; struct bpf_map_def *def; struct bpf_map *map; int err; @@ -918,9 +932,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, map->libbpf_type = type; map->sec_idx = sec_idx; map->sec_offset = 0; - snprintf(map_name, sizeof(map_name), "%.8s%.7s", obj->name, - libbpf_type_to_btf_name[type]); - map->name = strdup(map_name); + map->name = internal_map_name(obj, type); if (!map->name) { pr_warn("failed to alloc map name\n"); return -ENOMEM; @@ -931,12 +943,12 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, def->key_size = sizeof(int); def->value_size = data_sz; def->max_entries = 1; - def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_EXTERN + def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_KCONFIG ? BPF_F_RDONLY_PROG : 0; def->map_flags |= BPF_F_MMAPABLE; pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", - map_name, map->sec_idx, map->sec_offset, def->map_flags); + map->name, map->sec_idx, map->sec_offset, def->map_flags); map->mmaped = mmap(NULL, bpf_map_mmap_sz(map), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); @@ -1232,7 +1244,7 @@ out: return err; } -static int bpf_object__init_extern_map(struct bpf_object *obj) +static int bpf_object__init_kconfig_map(struct bpf_object *obj) { struct extern_desc *last_ext; size_t map_sz; @@ -1244,13 +1256,13 @@ static int bpf_object__init_extern_map(struct bpf_object *obj) last_ext = &obj->externs[obj->nr_extern - 1]; map_sz = last_ext->data_off + last_ext->sz; - err = bpf_object__init_internal_map(obj, LIBBPF_MAP_EXTERN, + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_KCONFIG, obj->efile.symbols_shndx, NULL, map_sz); if (err) return err; - obj->extern_map_idx = obj->nr_maps - 1; + obj->kconfig_map_idx = obj->nr_maps - 1; return 0; } @@ -1742,7 +1754,7 @@ static int bpf_object__init_maps(struct bpf_object *obj, err = bpf_object__init_user_maps(obj, strict); err = err ?: bpf_object__init_user_btf_maps(obj, strict, pin_root_path); err = err ?: bpf_object__init_global_data_maps(obj); - err = err ?: bpf_object__init_extern_map(obj); + err = err ?: bpf_object__init_kconfig_map(obj); if (err) return err; @@ -2269,9 +2281,9 @@ static int bpf_object__collect_externs(struct bpf_object *obj) i, ext->sym_idx, ext->data_off, ext->name); } - btf_id = btf__find_by_name(obj->btf, EXTERN_SEC); + btf_id = btf__find_by_name(obj->btf, KCONFIG_SEC); if (btf_id <= 0) { - pr_warn("no BTF info found for '%s' datasec\n", EXTERN_SEC); + pr_warn("no BTF info found for '%s' datasec\n", KCONFIG_SEC); return -ESRCH; } @@ -2361,7 +2373,7 @@ bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) else if (shndx == obj->efile.rodata_shndx) return LIBBPF_MAP_RODATA; else if (shndx == obj->efile.symbols_shndx) - return LIBBPF_MAP_EXTERN; + return LIBBPF_MAP_KCONFIG; else return LIBBPF_MAP_UNSPEC; } @@ -2908,8 +2920,8 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) return err; } - /* Freeze .rodata and .extern map as read-only from syscall side. */ - if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_EXTERN) { + /* Freeze .rodata and .kconfig map as read-only from syscall side. */ + if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG) { err = bpf_map_freeze(map->fd); if (err) { err = -errno; @@ -4264,7 +4276,7 @@ bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) break; case RELO_EXTERN: insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; - insn[0].imm = obj->maps[obj->extern_map_idx].fd; + insn[0].imm = obj->maps[obj->kconfig_map_idx].fd; insn[1].imm = relo->sym_off; break; case RELO_CALL: @@ -4743,7 +4755,7 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, if (obj->nr_extern == 0) return 0; - data = obj->maps[obj->extern_map_idx].mmaped; + data = obj->maps[obj->kconfig_map_idx].mmaped; for (i = 0; i < obj->nr_extern; i++) { ext = &obj->externs[i]; @@ -7546,7 +7558,7 @@ int bpf_object__open_skeleton(struct bpf_object_skeleton *s, } /* externs shouldn't be pre-setup from user code */ - if (mmaped && (*map)->libbpf_type != LIBBPF_MAP_EXTERN) + if (mmaped && (*map)->libbpf_type != LIBBPF_MAP_KCONFIG) *mmaped = (*map)->mmaped; } diff --git a/tools/testing/selftests/bpf/prog_tests/skeleton.c b/tools/testing/selftests/bpf/prog_tests/skeleton.c index ec6f2aec3853..9264a2736018 100644 --- a/tools/testing/selftests/bpf/prog_tests/skeleton.c +++ b/tools/testing/selftests/bpf/prog_tests/skeleton.c @@ -15,20 +15,18 @@ void test_skeleton(void) int duration = 0, err; struct test_skeleton* skel; struct test_skeleton__bss *bss; - struct test_skeleton__externs *exts; + struct test_skeleton__kconfig *kcfg; skel = test_skeleton__open(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) return; - printf("EXTERNS BEFORE: %p\n", skel->externs); - if (CHECK(skel->externs, "skel_externs", "externs are mmaped()!\n")) + if (CHECK(skel->kconfig, "skel_kconfig", "kconfig is mmaped()!\n")) goto cleanup; err = test_skeleton__load(skel); if (CHECK(err, "skel_load", "failed to load skeleton: %d\n", err)) goto cleanup; - printf("EXTERNS AFTER: %p\n", skel->externs); bss = skel->bss; bss->in1 = 1; @@ -37,7 +35,7 @@ void test_skeleton(void) bss->in4 = 4; bss->in5.a = 5; bss->in5.b = 6; - exts = skel->externs; + kcfg = skel->kconfig; err = test_skeleton__attach(skel); if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) @@ -55,10 +53,10 @@ void test_skeleton(void) CHECK(bss->handler_out5.b != 6, "res6", "got %lld != exp %d\n", bss->handler_out5.b, 6); - CHECK(bss->bpf_syscall != exts->CONFIG_BPF_SYSCALL, "ext1", - "got %d != exp %d\n", bss->bpf_syscall, exts->CONFIG_BPF_SYSCALL); - CHECK(bss->kern_ver != exts->LINUX_KERNEL_VERSION, "ext2", - "got %d != exp %d\n", bss->kern_ver, exts->LINUX_KERNEL_VERSION); + CHECK(bss->bpf_syscall != kcfg->CONFIG_BPF_SYSCALL, "ext1", + "got %d != exp %d\n", bss->bpf_syscall, kcfg->CONFIG_BPF_SYSCALL); + CHECK(bss->kern_ver != kcfg->LINUX_KERNEL_VERSION, "ext2", + "got %d != exp %d\n", bss->kern_ver, kcfg->LINUX_KERNEL_VERSION); cleanup: test_skeleton__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/test_core_extern.c b/tools/testing/selftests/bpf/progs/test_core_extern.c index e12f09f9e881..9bfc91d9d004 100644 --- a/tools/testing/selftests/bpf/progs/test_core_extern.c +++ b/tools/testing/selftests/bpf/progs/test_core_extern.c @@ -10,16 +10,16 @@ /* non-existing BPF helper, to test dead code elimination */ static int (*bpf_missing_helper)(const void *arg1, int arg2) = (void *) 999; -extern int LINUX_KERNEL_VERSION; -extern bool CONFIG_BPF_SYSCALL; /* strong */ -extern enum libbpf_tristate CONFIG_TRISTATE __weak; -extern bool CONFIG_BOOL __weak; -extern char CONFIG_CHAR __weak; -extern uint16_t CONFIG_USHORT __weak; -extern int CONFIG_INT __weak; -extern uint64_t CONFIG_ULONG __weak; -extern const char CONFIG_STR[8] __weak; -extern uint64_t CONFIG_MISSING __weak; +extern int LINUX_KERNEL_VERSION __kconfig; +extern bool CONFIG_BPF_SYSCALL __kconfig; /* strong */ +extern enum libbpf_tristate CONFIG_TRISTATE __kconfig __weak; +extern bool CONFIG_BOOL __kconfig __weak; +extern char CONFIG_CHAR __kconfig __weak; +extern uint16_t CONFIG_USHORT __kconfig __weak; +extern int CONFIG_INT __kconfig __weak; +extern uint64_t CONFIG_ULONG __kconfig __weak; +extern const char CONFIG_STR[8] __kconfig __weak; +extern uint64_t CONFIG_MISSING __kconfig __weak; uint64_t kern_ver = -1; uint64_t bpf_syscall = -1; diff --git a/tools/testing/selftests/bpf/progs/test_skeleton.c b/tools/testing/selftests/bpf/progs/test_skeleton.c index 9caa44758ea2..4f69aac5635f 100644 --- a/tools/testing/selftests/bpf/progs/test_skeleton.c +++ b/tools/testing/selftests/bpf/progs/test_skeleton.c @@ -21,8 +21,8 @@ char out3 = 0; long long out4 = 0; int out1 = 0; -extern bool CONFIG_BPF_SYSCALL; -extern int LINUX_KERNEL_VERSION; +extern bool CONFIG_BPF_SYSCALL __kconfig; +extern int LINUX_KERNEL_VERSION __kconfig; bool bpf_syscall = 0; int kern_ver = 0; From 8601fd422148a8f7ff5f7eaf75b6703d5166332c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 18 Dec 2019 16:28:35 -0800 Subject: [PATCH 075/127] libbpf: Allow to augment system Kconfig through extra optional config Instead of all or nothing approach of overriding Kconfig file location, allow to extend it with extra values and override chosen subset of values though optional user-provided extra config, passed as a string through open options' .kconfig option. If same config key is present in both user-supplied config and Kconfig, user-supplied one wins. This allows applications to more easily test various conditions despite host kernel's real configuration. If all of BPF object's __kconfig externs are satisfied from user-supplied config, system Kconfig won't be read at all. Simplify selftests by not needing to create temporary Kconfig files. Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191219002837.3074619-3-andriin@fb.com --- tools/lib/bpf/libbpf.c | 210 +++++++++++------- tools/lib/bpf/libbpf.h | 8 +- .../selftests/bpf/prog_tests/core_extern.c | 32 +-- 3 files changed, 135 insertions(+), 115 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0772eb3254e2..7b74382c7977 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -302,7 +302,7 @@ struct bpf_object { size_t nr_maps; size_t maps_cap; - char *kconfig_path; + char *kconfig; struct extern_desc *externs; int nr_extern; int kconfig_map_idx; @@ -1149,94 +1149,98 @@ static int set_ext_value_num(struct extern_desc *ext, void *ext_val, return 0; } -static int bpf_object__read_kernel_config(struct bpf_object *obj, - const char *config_path, - void *data) +static int bpf_object__process_kconfig_line(struct bpf_object *obj, + char *buf, void *data) { - char buf[PATH_MAX], *sep, *value; struct extern_desc *ext; + char *sep, *value; int len, err = 0; void *ext_val; __u64 num; + + if (strncmp(buf, "CONFIG_", 7)) + return 0; + + sep = strchr(buf, '='); + if (!sep) { + pr_warn("failed to parse '%s': no separator\n", buf); + return -EINVAL; + } + + /* Trim ending '\n' */ + len = strlen(buf); + if (buf[len - 1] == '\n') + buf[len - 1] = '\0'; + /* Split on '=' and ensure that a value is present. */ + *sep = '\0'; + if (!sep[1]) { + *sep = '='; + pr_warn("failed to parse '%s': no value\n", buf); + return -EINVAL; + } + + ext = find_extern_by_name(obj, buf); + if (!ext || ext->is_set) + return 0; + + ext_val = data + ext->data_off; + value = sep + 1; + + switch (*value) { + case 'y': case 'n': case 'm': + err = set_ext_value_tri(ext, ext_val, *value); + break; + case '"': + err = set_ext_value_str(ext, ext_val, value); + break; + default: + /* assume integer */ + err = parse_u64(value, &num); + if (err) { + pr_warn("extern %s=%s should be integer\n", + ext->name, value); + return err; + } + err = set_ext_value_num(ext, ext_val, num); + break; + } + if (err) + return err; + pr_debug("extern %s=%s\n", ext->name, value); + return 0; +} + +static int bpf_object__read_kconfig_file(struct bpf_object *obj, void *data) +{ + char buf[PATH_MAX]; + struct utsname uts; + int len, err = 0; gzFile file; - if (config_path) { - file = gzopen(config_path, "r"); - } else { - struct utsname uts; + uname(&uts); + len = snprintf(buf, PATH_MAX, "/boot/config-%s", uts.release); + if (len < 0) + return -EINVAL; + else if (len >= PATH_MAX) + return -ENAMETOOLONG; + + /* gzopen also accepts uncompressed files. */ + file = gzopen(buf, "r"); + if (!file) + file = gzopen("/proc/config.gz", "r"); - uname(&uts); - len = snprintf(buf, PATH_MAX, "/boot/config-%s", uts.release); - if (len < 0) - return -EINVAL; - else if (len >= PATH_MAX) - return -ENAMETOOLONG; - /* gzopen also accepts uncompressed files. */ - file = gzopen(buf, "r"); - if (!file) - file = gzopen("/proc/config.gz", "r"); - } if (!file) { - pr_warn("failed to read kernel config at '%s'\n", config_path); + pr_warn("failed to open system Kconfig\n"); return -ENOENT; } while (gzgets(file, buf, sizeof(buf))) { - if (strncmp(buf, "CONFIG_", 7)) - continue; - - sep = strchr(buf, '='); - if (!sep) { - err = -EINVAL; - pr_warn("failed to parse '%s': no separator\n", buf); + err = bpf_object__process_kconfig_line(obj, buf, data); + if (err) { + pr_warn("error parsing system Kconfig line '%s': %d\n", + buf, err); goto out; } - /* Trim ending '\n' */ - len = strlen(buf); - if (buf[len - 1] == '\n') - buf[len - 1] = '\0'; - /* Split on '=' and ensure that a value is present. */ - *sep = '\0'; - if (!sep[1]) { - err = -EINVAL; - *sep = '='; - pr_warn("failed to parse '%s': no value\n", buf); - goto out; - } - - ext = find_extern_by_name(obj, buf); - if (!ext) - continue; - if (ext->is_set) { - err = -EINVAL; - pr_warn("re-defining extern '%s' not allowed\n", buf); - goto out; - } - - ext_val = data + ext->data_off; - value = sep + 1; - - switch (*value) { - case 'y': case 'n': case 'm': - err = set_ext_value_tri(ext, ext_val, *value); - break; - case '"': - err = set_ext_value_str(ext, ext_val, value); - break; - default: - /* assume integer */ - err = parse_u64(value, &num); - if (err) { - pr_warn("extern %s=%s should be integer\n", - ext->name, value); - goto out; - } - err = set_ext_value_num(ext, ext_val, num); - break; - } - if (err) - goto out; - pr_debug("extern %s=%s\n", ext->name, value); } out: @@ -1244,6 +1248,33 @@ out: return err; } +static int bpf_object__read_kconfig_mem(struct bpf_object *obj, + const char *config, void *data) +{ + char buf[PATH_MAX]; + int err = 0; + FILE *file; + + file = fmemopen((void *)config, strlen(config), "r"); + if (!file) { + err = -errno; + pr_warn("failed to open in-memory Kconfig: %d\n", err); + return err; + } + + while (fgets(buf, sizeof(buf), file)) { + err = bpf_object__process_kconfig_line(obj, buf, data); + if (err) { + pr_warn("error parsing in-memory Kconfig line '%s': %d\n", + buf, err); + break; + } + } + + fclose(file); + return err; +} + static int bpf_object__init_kconfig_map(struct bpf_object *obj) { struct extern_desc *last_ext; @@ -4567,7 +4598,7 @@ static struct bpf_object * __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) { - const char *obj_name, *kconfig_path; + const char *obj_name, *kconfig; struct bpf_program *prog; struct bpf_object *obj; char tmp_name[64]; @@ -4599,10 +4630,10 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, return obj; obj->relaxed_core_relocs = OPTS_GET(opts, relaxed_core_relocs, false); - kconfig_path = OPTS_GET(opts, kconfig_path, NULL); - if (kconfig_path) { - obj->kconfig_path = strdup(kconfig_path); - if (!obj->kconfig_path) + kconfig = OPTS_GET(opts, kconfig, NULL); + if (kconfig) { + obj->kconfig = strdup(kconfig); + if (!obj->kconfig) return ERR_PTR(-ENOMEM); } @@ -4745,7 +4776,7 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) } static int bpf_object__resolve_externs(struct bpf_object *obj, - const char *config_path) + const char *extra_kconfig) { bool need_config = false; struct extern_desc *ext; @@ -4779,8 +4810,21 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, return -EINVAL; } } + if (need_config && extra_kconfig) { + err = bpf_object__read_kconfig_mem(obj, extra_kconfig, data); + if (err) + return -EINVAL; + need_config = false; + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (!ext->is_set) { + need_config = true; + break; + } + } + } if (need_config) { - err = bpf_object__read_kernel_config(obj, config_path, data); + err = bpf_object__read_kconfig_file(obj, data); if (err) return -EINVAL; } @@ -4818,7 +4862,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) obj->loaded = true; err = bpf_object__probe_caps(obj); - err = err ? : bpf_object__resolve_externs(obj, obj->kconfig_path); + err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); err = err ? : bpf_object__create_maps(obj); @@ -5412,7 +5456,7 @@ void bpf_object__close(struct bpf_object *obj) zfree(&map->pin_path); } - zfree(&obj->kconfig_path); + zfree(&obj->kconfig); zfree(&obj->externs); obj->nr_extern = 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index ad8c1c127933..fe592ef48f1b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -85,12 +85,12 @@ struct bpf_object_open_opts { */ const char *pin_root_path; __u32 attach_prog_fd; - /* kernel config file path override (for CONFIG_ externs); can point - * to either uncompressed text file or .gz file + /* Additional kernel config content that augments and overrides + * system Kconfig for CONFIG_xxx externs. */ - const char *kconfig_path; + const char *kconfig; }; -#define bpf_object_open_opts__last_field kconfig_path +#define bpf_object_open_opts__last_field kconfig LIBBPF_API struct bpf_object *bpf_object__open(const char *path); LIBBPF_API struct bpf_object * diff --git a/tools/testing/selftests/bpf/prog_tests/core_extern.c b/tools/testing/selftests/bpf/prog_tests/core_extern.c index 5f03dc1de29e..b093787e9448 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_extern.c +++ b/tools/testing/selftests/bpf/prog_tests/core_extern.c @@ -23,19 +23,13 @@ static uint32_t get_kernel_version(void) static struct test_case { const char *name; const char *cfg; - const char *cfg_path; bool fails; struct test_core_extern__data data; } test_cases[] = { - { .name = "default search path", .cfg_path = NULL, - .data = { .bpf_syscall = true } }, - { .name = "/proc/config.gz", .cfg_path = "/proc/config.gz", - .data = { .bpf_syscall = true } }, - { .name = "missing config", .fails = true, - .cfg_path = "/proc/invalid-config.gz" }, + { .name = "default search path", .data = { .bpf_syscall = true } }, { .name = "custom values", - .cfg = "CONFIG_BPF_SYSCALL=y\n" + .cfg = "CONFIG_BPF_SYSCALL=n\n" "CONFIG_TRISTATE=m\n" "CONFIG_BOOL=y\n" "CONFIG_CHAR=100\n" @@ -45,7 +39,7 @@ static struct test_case { "CONFIG_STR=\"abracad\"\n" "CONFIG_MISSING=0", .data = { - .bpf_syscall = true, + .bpf_syscall = false, .tristate_val = TRI_MODULE, .bool_val = true, .char_val = 100, @@ -133,30 +127,14 @@ void test_core_extern(void) int n = sizeof(*skel->data) / sizeof(uint64_t); for (i = 0; i < ARRAY_SIZE(test_cases); i++) { - char tmp_cfg_path[] = "/tmp/test_core_extern_cfg.XXXXXX"; struct test_case *t = &test_cases[i]; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, - .kconfig_path = t->cfg_path, + .kconfig = t->cfg, ); if (!test__start_subtest(t->name)) continue; - if (t->cfg) { - size_t n = strlen(t->cfg) + 1; - int fd = mkstemp(tmp_cfg_path); - int written; - - if (CHECK(fd < 0, "mkstemp", "errno: %d\n", errno)) - continue; - printf("using '%s' as config file\n", tmp_cfg_path); - written = write(fd, t->cfg, n); - close(fd); - if (CHECK_FAIL(written != n)) - goto cleanup; - opts.kconfig_path = tmp_cfg_path; - } - skel = test_core_extern__open_opts(&opts); if (CHECK(!skel, "skel_open", "skeleton open failed\n")) goto cleanup; @@ -185,8 +163,6 @@ void test_core_extern(void) j, exp[j], got[j]); } cleanup: - if (t->cfg) - unlink(tmp_cfg_path); test_core_extern__destroy(skel); skel = NULL; } From 630628cb7dc39780660d8fcedc66e0298a82f9da Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 18 Dec 2019 16:28:36 -0800 Subject: [PATCH 076/127] libbpf: BTF is required when externs are present BTF is required to get type information about extern variables. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191219002837.3074619-4-andriin@fb.com --- tools/lib/bpf/libbpf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7b74382c7977..6340b81b555b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1887,7 +1887,8 @@ static void bpf_object__sanitize_btf_ext(struct bpf_object *obj) static bool bpf_object__is_btf_mandatory(const struct bpf_object *obj) { - return obj->efile.btf_maps_shndx >= 0; + return obj->efile.btf_maps_shndx >= 0 || + obj->nr_extern > 0; } static int bpf_object__init_btf(struct bpf_object *obj, From f1003b787c00fbaa4b11619c6b23a885bfce8f07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 16 Dec 2019 10:13:35 +0100 Subject: [PATCH 077/127] riscv, bpf: Fix broken BPF tail calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BPF JIT incorrectly clobbered the a0 register, and did not flag usage of s5 register when BPF stack was being used. Fixes: 2353ecc6f91f ("bpf, riscv: add BPF JIT for RV64G") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191216091343.23260-2-bjorn.topel@gmail.com --- arch/riscv/net/bpf_jit_comp.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c index 5451ef3845f2..1606ebd49666 100644 --- a/arch/riscv/net/bpf_jit_comp.c +++ b/arch/riscv/net/bpf_jit_comp.c @@ -120,6 +120,11 @@ static bool seen_reg(int reg, struct rv_jit_context *ctx) return false; } +static void mark_fp(struct rv_jit_context *ctx) +{ + __set_bit(RV_CTX_F_SEEN_S5, &ctx->flags); +} + static void mark_call(struct rv_jit_context *ctx) { __set_bit(RV_CTX_F_SEEN_CALL, &ctx->flags); @@ -596,7 +601,8 @@ static void __build_epilogue(u8 reg, struct rv_jit_context *ctx) emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx); /* Set return value. */ - emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx); + if (reg == RV_REG_RA) + emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx); emit(rv_jalr(RV_REG_ZERO, reg, 0), ctx); } @@ -1426,6 +1432,10 @@ static void build_prologue(struct rv_jit_context *ctx) { int stack_adjust = 0, store_offset, bpf_stack_adjust; + bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); + if (bpf_stack_adjust) + mark_fp(ctx); + if (seen_reg(RV_REG_RA, ctx)) stack_adjust += 8; stack_adjust += 8; /* RV_REG_FP */ @@ -1443,7 +1453,6 @@ static void build_prologue(struct rv_jit_context *ctx) stack_adjust += 8; stack_adjust = round_up(stack_adjust, 16); - bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); stack_adjust += bpf_stack_adjust; store_offset = stack_adjust - 8; From 7d1ef13fea2b66bb466592a7855b33181c85ec5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 16 Dec 2019 10:13:36 +0100 Subject: [PATCH 078/127] riscv, bpf: Add support for far branching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds branch relaxation to the BPF JIT, and with that support for far (offset greater than 12b) branching. The branch relaxation requires more than two passes to converge. For most programs it is three passes, but for larger programs it can be more. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Luke Nelson Link: https://lore.kernel.org/bpf/20191216091343.23260-3-bjorn.topel@gmail.com --- arch/riscv/net/bpf_jit_comp.c | 354 ++++++++++++++++++---------------- 1 file changed, 189 insertions(+), 165 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c index 1606ebd49666..e599458a9bcd 100644 --- a/arch/riscv/net/bpf_jit_comp.c +++ b/arch/riscv/net/bpf_jit_comp.c @@ -461,6 +461,11 @@ static u32 rv_amoadd_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) return rv_amo_insn(0, aq, rl, rs2, rs1, 3, rd, 0x2f); } +static u32 rv_auipc(u8 rd, u32 imm31_12) +{ + return rv_u_insn(imm31_12, rd, 0x17); +} + static bool is_12b_int(s64 val) { return -(1 << 11) <= val && val < (1 << 11); @@ -484,7 +489,7 @@ static bool is_32b_int(s64 val) static int is_12b_check(int off, int insn) { if (!is_12b_int(off)) { - pr_err("bpf-jit: insn=%d offset=%d not supported yet!\n", + pr_err("bpf-jit: insn=%d 12b < offset=%d not supported yet!\n", insn, (int)off); return -1; } @@ -494,7 +499,7 @@ static int is_12b_check(int off, int insn) static int is_13b_check(int off, int insn) { if (!is_13b_int(off)) { - pr_err("bpf-jit: insn=%d offset=%d not supported yet!\n", + pr_err("bpf-jit: insn=%d 13b < offset=%d not supported yet!\n", insn, (int)off); return -1; } @@ -504,7 +509,7 @@ static int is_13b_check(int off, int insn) static int is_21b_check(int off, int insn) { if (!is_21b_int(off)) { - pr_err("bpf-jit: insn=%d offset=%d not supported yet!\n", + pr_err("bpf-jit: insn=%d 21b < offset=%d not supported yet!\n", insn, (int)off); return -1; } @@ -550,10 +555,13 @@ static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx) emit(rv_addi(rd, rd, lower), ctx); } -static int rv_offset(int bpf_to, int bpf_from, struct rv_jit_context *ctx) +static int rv_offset(int insn, int off, struct rv_jit_context *ctx) { - int from = ctx->offset[bpf_from] - 1, to = ctx->offset[bpf_to]; + int from, to; + off++; /* BPF branch is from PC+1, RV is from PC */ + from = (insn > 0) ? ctx->offset[insn - 1] : 0; + to = (insn + off > 0) ? ctx->offset[insn + off - 1] : 0; return (to - from) << 2; } @@ -606,6 +614,109 @@ static void __build_epilogue(u8 reg, struct rv_jit_context *ctx) emit(rv_jalr(RV_REG_ZERO, reg, 0), ctx); } +/* return -1 or inverted cond */ +static int invert_bpf_cond(u8 cond) +{ + switch (cond) { + case BPF_JEQ: + return BPF_JNE; + case BPF_JGT: + return BPF_JLE; + case BPF_JLT: + return BPF_JGE; + case BPF_JGE: + return BPF_JLT; + case BPF_JLE: + return BPF_JGT; + case BPF_JNE: + return BPF_JEQ; + case BPF_JSGT: + return BPF_JSLE; + case BPF_JSLT: + return BPF_JSGE; + case BPF_JSGE: + return BPF_JSLT; + case BPF_JSLE: + return BPF_JSGT; + } + return -1; +} + +static void emit_bcc(u8 cond, u8 rd, u8 rs, int rvoff, + struct rv_jit_context *ctx) +{ + switch (cond) { + case BPF_JEQ: + emit(rv_beq(rd, rs, rvoff >> 1), ctx); + return; + case BPF_JGT: + emit(rv_bltu(rs, rd, rvoff >> 1), ctx); + return; + case BPF_JLT: + emit(rv_bltu(rd, rs, rvoff >> 1), ctx); + return; + case BPF_JGE: + emit(rv_bgeu(rd, rs, rvoff >> 1), ctx); + return; + case BPF_JLE: + emit(rv_bgeu(rs, rd, rvoff >> 1), ctx); + return; + case BPF_JNE: + emit(rv_bne(rd, rs, rvoff >> 1), ctx); + return; + case BPF_JSGT: + emit(rv_blt(rs, rd, rvoff >> 1), ctx); + return; + case BPF_JSLT: + emit(rv_blt(rd, rs, rvoff >> 1), ctx); + return; + case BPF_JSGE: + emit(rv_bge(rd, rs, rvoff >> 1), ctx); + return; + case BPF_JSLE: + emit(rv_bge(rs, rd, rvoff >> 1), ctx); + } +} + +static void emit_branch(u8 cond, u8 rd, u8 rs, int rvoff, + struct rv_jit_context *ctx) +{ + s64 upper, lower; + + if (is_13b_int(rvoff)) { + emit_bcc(cond, rd, rs, rvoff, ctx); + return; + } + + /* Adjust for jal */ + rvoff -= 4; + + /* Transform, e.g.: + * bne rd,rs,foo + * to + * beq rd,rs,<.L1> + * (auipc foo) + * jal(r) foo + * .L1 + */ + cond = invert_bpf_cond(cond); + if (is_21b_int(rvoff)) { + emit_bcc(cond, rd, rs, 8, ctx); + emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx); + return; + } + + /* 32b No need for an additional rvoff adjustment, since we + * get that from the auipc at PC', where PC = PC' + 4. + */ + upper = (rvoff + (1 << 11)) >> 12; + lower = rvoff & 0xfff; + + emit_bcc(cond, rd, rs, 12, ctx); + emit(rv_auipc(RV_REG_T1, upper), ctx); + emit(rv_jalr(RV_REG_ZERO, RV_REG_T1, lower), ctx); +} + static void emit_zext_32(u8 reg, struct rv_jit_context *ctx) { emit(rv_slli(reg, reg, 32), ctx); @@ -693,13 +804,6 @@ static void init_regs(u8 *rd, u8 *rs, const struct bpf_insn *insn, *rs = bpf_to_rv_reg(insn->src_reg, ctx); } -static int rv_offset_check(int *rvoff, s16 off, int insn, - struct rv_jit_context *ctx) -{ - *rvoff = rv_offset(insn + off, insn, ctx); - return is_13b_check(*rvoff, insn); -} - static void emit_zext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx) { emit(rv_addi(RV_REG_T2, *rd, 0), ctx); @@ -732,13 +836,19 @@ static void emit_sext_32_rd(u8 *rd, struct rv_jit_context *ctx) *rd = RV_REG_T2; } +static bool is_signed_bpf_cond(u8 cond) +{ + return cond == BPF_JSGT || cond == BPF_JSLT || + cond == BPF_JSGE || cond == BPF_JSLE; +} + static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, bool extra_pass) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64 || BPF_CLASS(insn->code) == BPF_JMP; + int s, e, rvoff, i = insn - ctx->prog->insnsi; struct bpf_prog_aux *aux = ctx->prog->aux; - int rvoff, i = insn - ctx->prog->insnsi; u8 rd = -1, rs = -1, code = insn->code; s16 off = insn->off; s32 imm = insn->imm; @@ -1006,7 +1116,7 @@ out_be: /* JUMP off */ case BPF_JMP | BPF_JA: - rvoff = rv_offset(i + off, i, ctx); + rvoff = rv_offset(i, off, ctx); if (!is_21b_int(rvoff)) { pr_err("bpf-jit: insn=%d offset=%d not supported yet!\n", i, rvoff); @@ -1019,194 +1129,96 @@ out_be: /* IF (dst COND src) JUMP off */ case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP32 | BPF_JEQ | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_zext_32_rd_rs(&rd, &rs, ctx); - emit(rv_beq(rd, rs, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP32 | BPF_JGT | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_zext_32_rd_rs(&rd, &rs, ctx); - emit(rv_bltu(rs, rd, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP32 | BPF_JLT | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_zext_32_rd_rs(&rd, &rs, ctx); - emit(rv_bltu(rd, rs, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP32 | BPF_JGE | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_zext_32_rd_rs(&rd, &rs, ctx); - emit(rv_bgeu(rd, rs, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP32 | BPF_JLE | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_zext_32_rd_rs(&rd, &rs, ctx); - emit(rv_bgeu(rs, rd, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JNE | BPF_X: case BPF_JMP32 | BPF_JNE | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_zext_32_rd_rs(&rd, &rs, ctx); - emit(rv_bne(rd, rs, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JSGT | BPF_X: case BPF_JMP32 | BPF_JSGT | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_sext_32_rd_rs(&rd, &rs, ctx); - emit(rv_blt(rs, rd, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP32 | BPF_JSLT | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_sext_32_rd_rs(&rd, &rs, ctx); - emit(rv_blt(rd, rs, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JSGE | BPF_X: case BPF_JMP32 | BPF_JSGE | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_sext_32_rd_rs(&rd, &rs, ctx); - emit(rv_bge(rd, rs, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JSLE | BPF_X: case BPF_JMP32 | BPF_JSLE | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_sext_32_rd_rs(&rd, &rs, ctx); - emit(rv_bge(rs, rd, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP32 | BPF_JSET | BPF_X: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - if (!is64) - emit_zext_32_rd_rs(&rd, &rs, ctx); - emit(rv_and(RV_REG_T1, rd, rs), ctx); - emit(rv_bne(RV_REG_T1, RV_REG_ZERO, rvoff >> 1), ctx); + rvoff = rv_offset(i, off, ctx); + if (!is64) { + s = ctx->ninsns; + if (is_signed_bpf_cond(BPF_OP(code))) + emit_sext_32_rd_rs(&rd, &rs, ctx); + else + emit_zext_32_rd_rs(&rd, &rs, ctx); + e = ctx->ninsns; + + /* Adjust for extra insns */ + rvoff -= (e - s) << 2; + } + + if (BPF_OP(code) == BPF_JSET) { + /* Adjust for and */ + rvoff -= 4; + emit(rv_and(RV_REG_T1, rd, rs), ctx); + emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff, + ctx); + } else { + emit_branch(BPF_OP(code), rd, rs, rvoff, ctx); + } break; /* IF (dst COND imm) JUMP off */ case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP32 | BPF_JEQ | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_zext_32_rd_t1(&rd, ctx); - emit(rv_beq(rd, RV_REG_T1, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP32 | BPF_JGT | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_zext_32_rd_t1(&rd, ctx); - emit(rv_bltu(RV_REG_T1, rd, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP32 | BPF_JLT | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_zext_32_rd_t1(&rd, ctx); - emit(rv_bltu(rd, RV_REG_T1, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP32 | BPF_JGE | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_zext_32_rd_t1(&rd, ctx); - emit(rv_bgeu(rd, RV_REG_T1, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP32 | BPF_JLE | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_zext_32_rd_t1(&rd, ctx); - emit(rv_bgeu(RV_REG_T1, rd, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP32 | BPF_JNE | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_zext_32_rd_t1(&rd, ctx); - emit(rv_bne(rd, RV_REG_T1, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JSGT | BPF_K: case BPF_JMP32 | BPF_JSGT | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_sext_32_rd(&rd, ctx); - emit(rv_blt(RV_REG_T1, rd, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSLT | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_sext_32_rd(&rd, ctx); - emit(rv_blt(rd, RV_REG_T1, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_sext_32_rd(&rd, ctx); - emit(rv_bge(rd, RV_REG_T1, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; - emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_sext_32_rd(&rd, ctx); - emit(rv_bge(RV_REG_T1, rd, rvoff >> 1), ctx); - break; case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP32 | BPF_JSET | BPF_K: - if (rv_offset_check(&rvoff, off, i, ctx)) - return -1; + rvoff = rv_offset(i, off, ctx); + s = ctx->ninsns; emit_imm(RV_REG_T1, imm, ctx); - if (!is64) - emit_zext_32_rd_t1(&rd, ctx); - emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx); - emit(rv_bne(RV_REG_T1, RV_REG_ZERO, rvoff >> 1), ctx); + if (!is64) { + if (is_signed_bpf_cond(BPF_OP(code))) + emit_sext_32_rd(&rd, ctx); + else + emit_zext_32_rd_t1(&rd, ctx); + } + e = ctx->ninsns; + + /* Adjust for extra insns */ + rvoff -= (e - s) << 2; + + if (BPF_OP(code) == BPF_JSET) { + /* Adjust for and */ + rvoff -= 4; + emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx); + emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff, + ctx); + } else { + emit_branch(BPF_OP(code), rd, RV_REG_T1, rvoff, ctx); + } break; /* function call */ @@ -1557,6 +1569,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { bool tmp_blinded = false, extra_pass = false; struct bpf_prog *tmp, *orig_prog = prog; + int pass = 0, prev_ninsns = 0, i; struct rv_jit_data *jit_data; struct rv_jit_context *ctx; unsigned int image_size; @@ -1596,15 +1609,25 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = orig_prog; goto out_offset; } - - /* First pass generates the ctx->offset, but does not emit an image. */ - if (build_body(ctx, extra_pass)) { - prog = orig_prog; - goto out_offset; + for (i = 0; i < prog->len; i++) { + prev_ninsns += 32; + ctx->offset[i] = prev_ninsns; + } + + for (i = 0; i < 16; i++) { + pass++; + ctx->ninsns = 0; + if (build_body(ctx, extra_pass)) { + prog = orig_prog; + goto out_offset; + } + build_prologue(ctx); + ctx->epilogue_offset = ctx->ninsns; + build_epilogue(ctx); + if (ctx->ninsns == prev_ninsns) + break; + prev_ninsns = ctx->ninsns; } - build_prologue(ctx); - ctx->epilogue_offset = ctx->ninsns; - build_epilogue(ctx); /* Allocate image, now that we know the size. */ image_size = sizeof(u32) * ctx->ninsns; @@ -1619,6 +1642,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* Second, real pass, that acutally emits the image. */ ctx->insns = (u32 *)jit_data->image; skip_init_ctx: + pass++; ctx->ninsns = 0; build_prologue(ctx); @@ -1630,7 +1654,7 @@ skip_init_ctx: build_epilogue(ctx); if (bpf_jit_enable > 1) - bpf_jit_dump(prog->len, image_size, 2, ctx->insns); + bpf_jit_dump(prog->len, image_size, pass, ctx->insns); prog->bpf_func = (void *)ctx->insns; prog->jited = 1; From 29d92edd9ee8b37bd8e9a0ba7fd549f874e0d069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 16 Dec 2019 10:13:37 +0100 Subject: [PATCH 079/127] riscv, bpf: Add support for far branching when emitting tail call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start use the emit_branch() function in the tail call emitter in order to support far branching. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191216091343.23260-4-bjorn.topel@gmail.com --- arch/riscv/net/bpf_jit_comp.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c index e599458a9bcd..c38c95df3440 100644 --- a/arch/riscv/net/bpf_jit_comp.c +++ b/arch/riscv/net/bpf_jit_comp.c @@ -496,16 +496,6 @@ static int is_12b_check(int off, int insn) return 0; } -static int is_13b_check(int off, int insn) -{ - if (!is_13b_int(off)) { - pr_err("bpf-jit: insn=%d 13b < offset=%d not supported yet!\n", - insn, (int)off); - return -1; - } - return 0; -} - static int is_21b_check(int off, int insn) { if (!is_21b_int(off)) { @@ -744,18 +734,14 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) return -1; emit(rv_lwu(RV_REG_T1, off, RV_REG_A1), ctx); off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2; - if (is_13b_check(off, insn)) - return -1; - emit(rv_bgeu(RV_REG_A2, RV_REG_T1, off >> 1), ctx); + emit_branch(BPF_JGE, RV_REG_A2, RV_REG_T1, off, ctx); /* if (--TCC < 0) * goto out; */ emit(rv_addi(RV_REG_T1, tcc, -1), ctx); off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2; - if (is_13b_check(off, insn)) - return -1; - emit(rv_blt(RV_REG_T1, RV_REG_ZERO, off >> 1), ctx); + emit_branch(BPF_JSLT, RV_REG_T1, RV_REG_ZERO, off, ctx); /* prog = array->ptrs[index]; * if (!prog) @@ -768,9 +754,7 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) return -1; emit(rv_ld(RV_REG_T2, off, RV_REG_T2), ctx); off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2; - if (is_13b_check(off, insn)) - return -1; - emit(rv_beq(RV_REG_T2, RV_REG_ZERO, off >> 1), ctx); + emit_branch(BPF_JEQ, RV_REG_T2, RV_REG_ZERO, off, ctx); /* goto *(prog->bpf_func + 4); */ off = offsetof(struct bpf_prog, bpf_func); From 33203c02f2f8ae8e19d7dbbabf55cfd0025efede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 16 Dec 2019 10:13:38 +0100 Subject: [PATCH 080/127] riscv, bpf: Add support for far jumps and exits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit add support for far (offset > 21b) jumps and exits. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Luke Nelson Link: https://lore.kernel.org/bpf/20191216091343.23260-5-bjorn.topel@gmail.com --- arch/riscv/net/bpf_jit_comp.c | 37 ++++++++++++++++------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c index c38c95df3440..2fc0f24ad30f 100644 --- a/arch/riscv/net/bpf_jit_comp.c +++ b/arch/riscv/net/bpf_jit_comp.c @@ -496,16 +496,6 @@ static int is_12b_check(int off, int insn) return 0; } -static int is_21b_check(int off, int insn) -{ - if (!is_21b_int(off)) { - pr_err("bpf-jit: insn=%d 21b < offset=%d not supported yet!\n", - insn, (int)off); - return -1; - } - return 0; -} - static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx) { /* Note that the immediate from the add is sign-extended, @@ -820,6 +810,21 @@ static void emit_sext_32_rd(u8 *rd, struct rv_jit_context *ctx) *rd = RV_REG_T2; } +static void emit_jump_and_link(u8 rd, int rvoff, struct rv_jit_context *ctx) +{ + s64 upper, lower; + + if (is_21b_int(rvoff)) { + emit(rv_jal(rd, rvoff >> 1), ctx); + return; + } + + upper = (rvoff + (1 << 11)) >> 12; + lower = rvoff & 0xfff; + emit(rv_auipc(RV_REG_T1, upper), ctx); + emit(rv_jalr(rd, RV_REG_T1, lower), ctx); +} + static bool is_signed_bpf_cond(u8 cond) { return cond == BPF_JSGT || cond == BPF_JSLT || @@ -1101,13 +1106,7 @@ out_be: /* JUMP off */ case BPF_JMP | BPF_JA: rvoff = rv_offset(i, off, ctx); - if (!is_21b_int(rvoff)) { - pr_err("bpf-jit: insn=%d offset=%d not supported yet!\n", - i, rvoff); - return -1; - } - - emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx); + emit_jump_and_link(RV_REG_ZERO, rvoff, ctx); break; /* IF (dst COND src) JUMP off */ @@ -1245,9 +1244,7 @@ out_be: break; rvoff = epilogue_offset(ctx); - if (is_21b_check(rvoff, i)) - return -1; - emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx); + emit_jump_and_link(RV_REG_ZERO, rvoff, ctx); break; /* dst = imm64 */ From fe8322b866d560a3b3535ed9aa49df59809decbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 16 Dec 2019 10:13:39 +0100 Subject: [PATCH 081/127] riscv, bpf: Optimize BPF tail calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove one addi, and instead use the offset part of jalr. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191216091343.23260-6-bjorn.topel@gmail.com --- arch/riscv/net/bpf_jit_comp.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c index 2fc0f24ad30f..8aa19c846881 100644 --- a/arch/riscv/net/bpf_jit_comp.c +++ b/arch/riscv/net/bpf_jit_comp.c @@ -552,7 +552,7 @@ static int epilogue_offset(struct rv_jit_context *ctx) return (to - from) << 2; } -static void __build_epilogue(u8 reg, struct rv_jit_context *ctx) +static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) { int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 8; @@ -589,9 +589,11 @@ static void __build_epilogue(u8 reg, struct rv_jit_context *ctx) emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx); /* Set return value. */ - if (reg == RV_REG_RA) + if (!is_tail_call) emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx); - emit(rv_jalr(RV_REG_ZERO, reg, 0), ctx); + emit(rv_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA, + is_tail_call ? 4 : 0), /* skip TCC init */ + ctx); } /* return -1 or inverted cond */ @@ -751,9 +753,8 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) if (is_12b_check(off, insn)) return -1; emit(rv_ld(RV_REG_T3, off, RV_REG_T2), ctx); - emit(rv_addi(RV_REG_T3, RV_REG_T3, 4), ctx); emit(rv_addi(RV_REG_TCC, RV_REG_T1, 0), ctx); - __build_epilogue(RV_REG_T3, ctx); + __build_epilogue(true, ctx); return 0; } @@ -1504,7 +1505,7 @@ static void build_prologue(struct rv_jit_context *ctx) static void build_epilogue(struct rv_jit_context *ctx) { - __build_epilogue(RV_REG_RA, ctx); + __build_epilogue(false, ctx); } static int build_body(struct rv_jit_context *ctx, bool extra_pass) From 7f3631e88ee6fb13beac333a0e9e605d2414d17f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 16 Dec 2019 10:13:40 +0100 Subject: [PATCH 082/127] riscv, bpf: Provide RISC-V specific JIT image alloc/free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes sure that the JIT images is kept close to the kernel text, so BPF calls can use relative calling with auipc/jalr or jal instead of loading the full 64-bit address and jalr. The BPF JIT image region is 128 MB before the kernel text. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191216091343.23260-7-bjorn.topel@gmail.com --- arch/riscv/include/asm/pgtable.h | 4 ++++ arch/riscv/net/bpf_jit_comp.c | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 7ff0ed4f292e..cc3f49415620 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -404,6 +404,10 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) +#define BPF_JIT_REGION_SIZE (SZ_128M) +#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) +#define BPF_JIT_REGION_END (VMALLOC_END) + /* * Roughly size the vmemmap space to be large enough to fit enough * struct pages to map half the virtual address space. Then diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c index 8aa19c846881..46cff093f526 100644 --- a/arch/riscv/net/bpf_jit_comp.c +++ b/arch/riscv/net/bpf_jit_comp.c @@ -1656,3 +1656,16 @@ out: tmp : orig_prog); return prog; } + +void *bpf_jit_alloc_exec(unsigned long size) +{ + return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, + BPF_JIT_REGION_END, GFP_KERNEL, + PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + __builtin_return_address(0)); +} + +void bpf_jit_free_exec(void *addr) +{ + return vfree(addr); +} From e368b64f8b0c8066115ad1da71f83f20f7bb43af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 16 Dec 2019 10:13:41 +0100 Subject: [PATCH 083/127] riscv, bpf: Optimize calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using emit_imm() and emit_jalr() which can expand to six instructions, start using jal or auipc+jalr. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191216091343.23260-8-bjorn.topel@gmail.com --- arch/riscv/net/bpf_jit_comp.c | 101 +++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c index 46cff093f526..8d7e3343a08c 100644 --- a/arch/riscv/net/bpf_jit_comp.c +++ b/arch/riscv/net/bpf_jit_comp.c @@ -811,11 +811,12 @@ static void emit_sext_32_rd(u8 *rd, struct rv_jit_context *ctx) *rd = RV_REG_T2; } -static void emit_jump_and_link(u8 rd, int rvoff, struct rv_jit_context *ctx) +static void emit_jump_and_link(u8 rd, s64 rvoff, bool force_jalr, + struct rv_jit_context *ctx) { s64 upper, lower; - if (is_21b_int(rvoff)) { + if (rvoff && is_21b_int(rvoff) && !force_jalr) { emit(rv_jal(rd, rvoff >> 1), ctx); return; } @@ -832,6 +833,28 @@ static bool is_signed_bpf_cond(u8 cond) cond == BPF_JSGE || cond == BPF_JSLE; } +static int emit_call(bool fixed, u64 addr, struct rv_jit_context *ctx) +{ + s64 off = 0; + u64 ip; + u8 rd; + + if (addr && ctx->insns) { + ip = (u64)(long)(ctx->insns + ctx->ninsns); + off = addr - ip; + if (!is_32b_int(off)) { + pr_err("bpf-jit: target call addr %pK is out of range\n", + (void *)addr); + return -ERANGE; + } + } + + emit_jump_and_link(RV_REG_RA, off, !fixed, ctx); + rd = bpf_to_rv_reg(BPF_REG_0, ctx); + emit(rv_addi(rd, RV_REG_A0, 0), ctx); + return 0; +} + static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, bool extra_pass) { @@ -1107,7 +1130,7 @@ out_be: /* JUMP off */ case BPF_JMP | BPF_JA: rvoff = rv_offset(i, off, ctx); - emit_jump_and_link(RV_REG_ZERO, rvoff, ctx); + emit_jump_and_link(RV_REG_ZERO, rvoff, false, ctx); break; /* IF (dst COND src) JUMP off */ @@ -1209,7 +1232,7 @@ out_be: case BPF_JMP | BPF_CALL: { bool fixed; - int i, ret; + int ret; u64 addr; mark_call(ctx); @@ -1217,20 +1240,9 @@ out_be: &fixed); if (ret < 0) return ret; - if (fixed) { - emit_imm(RV_REG_T1, addr, ctx); - } else { - i = ctx->ninsns; - emit_imm(RV_REG_T1, addr, ctx); - for (i = ctx->ninsns - i; i < 8; i++) { - /* nop */ - emit(rv_addi(RV_REG_ZERO, RV_REG_ZERO, 0), - ctx); - } - } - emit(rv_jalr(RV_REG_RA, RV_REG_T1, 0), ctx); - rd = bpf_to_rv_reg(BPF_REG_0, ctx); - emit(rv_addi(rd, RV_REG_A0, 0), ctx); + ret = emit_call(fixed, addr, ctx); + if (ret) + return ret; break; } /* tail call */ @@ -1245,7 +1257,7 @@ out_be: break; rvoff = epilogue_offset(ctx); - emit_jump_and_link(RV_REG_ZERO, rvoff, ctx); + emit_jump_and_link(RV_REG_ZERO, rvoff, false, ctx); break; /* dst = imm64 */ @@ -1508,7 +1520,7 @@ static void build_epilogue(struct rv_jit_context *ctx) __build_epilogue(false, ctx); } -static int build_body(struct rv_jit_context *ctx, bool extra_pass) +static int build_body(struct rv_jit_context *ctx, bool extra_pass, int *offset) { const struct bpf_prog *prog = ctx->prog; int i; @@ -1520,12 +1532,12 @@ static int build_body(struct rv_jit_context *ctx, bool extra_pass) ret = emit_insn(insn, ctx, extra_pass); if (ret > 0) { i++; - if (ctx->insns == NULL) - ctx->offset[i] = ctx->ninsns; + if (offset) + offset[i] = ctx->ninsns; continue; } - if (ctx->insns == NULL) - ctx->offset[i] = ctx->ninsns; + if (offset) + offset[i] = ctx->ninsns; if (ret) return ret; } @@ -1553,8 +1565,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) struct bpf_prog *tmp, *orig_prog = prog; int pass = 0, prev_ninsns = 0, i; struct rv_jit_data *jit_data; + unsigned int image_size = 0; struct rv_jit_context *ctx; - unsigned int image_size; if (!prog->jit_requested) return orig_prog; @@ -1599,36 +1611,51 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) for (i = 0; i < 16; i++) { pass++; ctx->ninsns = 0; - if (build_body(ctx, extra_pass)) { + if (build_body(ctx, extra_pass, ctx->offset)) { prog = orig_prog; goto out_offset; } build_prologue(ctx); ctx->epilogue_offset = ctx->ninsns; build_epilogue(ctx); - if (ctx->ninsns == prev_ninsns) - break; + + if (ctx->ninsns == prev_ninsns) { + if (jit_data->header) + break; + + image_size = sizeof(u32) * ctx->ninsns; + jit_data->header = + bpf_jit_binary_alloc(image_size, + &jit_data->image, + sizeof(u32), + bpf_fill_ill_insns); + if (!jit_data->header) { + prog = orig_prog; + goto out_offset; + } + + ctx->insns = (u32 *)jit_data->image; + /* Now, when the image is allocated, the image + * can potentially shrink more (auipc/jalr -> + * jal). + */ + } prev_ninsns = ctx->ninsns; } - /* Allocate image, now that we know the size. */ - image_size = sizeof(u32) * ctx->ninsns; - jit_data->header = bpf_jit_binary_alloc(image_size, &jit_data->image, - sizeof(u32), - bpf_fill_ill_insns); - if (!jit_data->header) { + if (i == 16) { + pr_err("bpf-jit: image did not converge in <%d passes!\n", i); + bpf_jit_binary_free(jit_data->header); prog = orig_prog; goto out_offset; } - /* Second, real pass, that acutally emits the image. */ - ctx->insns = (u32 *)jit_data->image; skip_init_ctx: pass++; ctx->ninsns = 0; build_prologue(ctx); - if (build_body(ctx, extra_pass)) { + if (build_body(ctx, extra_pass, NULL)) { bpf_jit_binary_free(jit_data->header); prog = orig_prog; goto out_offset; From eb9928bed003dd61a443d0ba51ae066429fbe735 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 16 Dec 2019 10:13:42 +0100 Subject: [PATCH 084/127] riscv, bpf: Add missing uapi header for BPF_PROG_TYPE_PERF_EVENT programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add missing uapi header the BPF_PROG_TYPE_PERF_EVENT programs by exporting struct user_regs_struct instead of struct pt_regs which is in-kernel only. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191216091343.23260-9-bjorn.topel@gmail.com --- arch/riscv/include/uapi/asm/bpf_perf_event.h | 9 +++++++++ tools/include/uapi/asm/bpf_perf_event.h | 2 ++ 2 files changed, 11 insertions(+) create mode 100644 arch/riscv/include/uapi/asm/bpf_perf_event.h diff --git a/arch/riscv/include/uapi/asm/bpf_perf_event.h b/arch/riscv/include/uapi/asm/bpf_perf_event.h new file mode 100644 index 000000000000..6cb1c2823288 --- /dev/null +++ b/arch/riscv/include/uapi/asm/bpf_perf_event.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ +#define _UAPI__ASM_BPF_PERF_EVENT_H__ + +#include + +typedef struct user_regs_struct bpf_user_pt_regs_t; + +#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */ diff --git a/tools/include/uapi/asm/bpf_perf_event.h b/tools/include/uapi/asm/bpf_perf_event.h index 13a58531e6fa..39acc149d843 100644 --- a/tools/include/uapi/asm/bpf_perf_event.h +++ b/tools/include/uapi/asm/bpf_perf_event.h @@ -2,6 +2,8 @@ #include "../../arch/arm64/include/uapi/asm/bpf_perf_event.h" #elif defined(__s390__) #include "../../arch/s390/include/uapi/asm/bpf_perf_event.h" +#elif defined(__riscv) +#include "../../arch/riscv/include/uapi/asm/bpf_perf_event.h" #else #include #endif From 34bfc10a6e7e573563a700239d302ab5944b3397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 16 Dec 2019 10:13:43 +0100 Subject: [PATCH 085/127] riscv, perf: Add arch specific perf_arch_bpf_user_pt_regs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RISC-V was missing a proper perf_arch_bpf_user_pt_regs macro for CONFIG_PERF_EVENT builds. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191216091343.23260-10-bjorn.topel@gmail.com --- arch/riscv/include/asm/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/include/asm/perf_event.h b/arch/riscv/include/asm/perf_event.h index aefbfaa6a781..0234048b12bc 100644 --- a/arch/riscv/include/asm/perf_event.h +++ b/arch/riscv/include/asm/perf_event.h @@ -82,4 +82,8 @@ struct riscv_pmu { int irq; }; +#ifdef CONFIG_PERF_EVENTS +#define perf_arch_bpf_user_pt_regs(regs) (struct user_regs_struct *)regs +#endif + #endif /* _ASM_RISCV_PERF_EVENT_H */ From 12dd14b230b3c742b80272ecb8a83cdf824625ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 19 Dec 2019 13:07:14 +0100 Subject: [PATCH 086/127] libbpf: Add missing newline in opts validation macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The error log output in the opts validation macro was missing a newline. Fixes: 2ce8450ef5a3 ("libbpf: add bpf_object__open_{file, mem} w/ extensible opts") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191219120714.928380-1-toke@redhat.com --- tools/lib/bpf/libbpf_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 7d71621bb7e8..8c3afbd97747 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -76,7 +76,7 @@ static inline bool libbpf_validate_opts(const char *opts, for (i = opts_sz; i < user_sz; i++) { if (opts[i]) { - pr_warn("%s has non-zero extra bytes", + pr_warn("%s has non-zero extra bytes\n", type_name); return false; } From 580205dd4fe800b1e95be8b6df9e2991f975a8ad Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 18 Dec 2019 18:04:42 -0800 Subject: [PATCH 087/127] selftests/bpf: Fix test_attach_probe Fix two issues in test_attach_probe: 1. it was not able to parse /proc/self/maps beyond the first line, since %s means parse string until white space. 2. offset has to be accounted for otherwise uprobed address is incorrect. Fixes: 1e8611bbdfc9 ("selftests/bpf: add kprobe/uprobe selftests") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191219020442.1922617-1-ast@kernel.org --- tools/testing/selftests/bpf/prog_tests/attach_probe.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 5ed90ede2f1d..a0ee87c8e1ea 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -3,7 +3,7 @@ #include "test_attach_probe.skel.h" ssize_t get_base_addr() { - size_t start; + size_t start, offset; char buf[256]; FILE *f; @@ -11,10 +11,11 @@ ssize_t get_base_addr() { if (!f) return -errno; - while (fscanf(f, "%zx-%*x %s %*s\n", &start, buf) == 2) { + while (fscanf(f, "%zx-%*x %s %zx %*[^\n]\n", + &start, buf, &offset) == 3) { if (strcmp(buf, "r-xp") == 0) { fclose(f); - return start; + return start - offset; } } From b5c7d0d0f7d4a30ae96c1350df677cb849060b7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 19 Dec 2019 10:02:36 +0100 Subject: [PATCH 088/127] libbpf: Fix printing of ulimit value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Naresh pointed out that libbpf builds fail on 32-bit architectures because rlimit.rlim_cur is defined as 'unsigned long long' on those architectures. Fix this by using %zu in printf and casting to size_t. Fixes: dc3a2d254782 ("libbpf: Print hint about ulimit when getting permission denied error") Reported-by: Naresh Kamboju Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191219090236.905059-1-toke@redhat.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6340b81b555b..febbaac3daf4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -117,7 +117,7 @@ static void pr_perm_msg(int err) return; if (limit.rlim_cur < 1024) - snprintf(buf, sizeof(buf), "%lu bytes", limit.rlim_cur); + snprintf(buf, sizeof(buf), "%zu bytes", (size_t)limit.rlim_cur); else if (limit.rlim_cur < 1024*1024) snprintf(buf, sizeof(buf), "%.1f KiB", (double)limit.rlim_cur / 1024); else From 7745ff9842617323adbe24e71c495d5ebd9aa352 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 18 Dec 2019 21:21:03 -0800 Subject: [PATCH 089/127] libbpf: Fix another __u64 printf warning Fix yet another printf warning for %llu specifier on ppc64le. This time size_t casting won't work, so cast to verbose `unsigned long long`. Fixes: 166750bc1dd2 ("libbpf: Support libbpf-provided extern variables") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191219052103.3515-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index febbaac3daf4..9576a90c5a1c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1129,12 +1129,12 @@ static int set_ext_value_num(struct extern_desc *ext, void *ext_val, { if (ext->type != EXT_INT && ext->type != EXT_CHAR) { pr_warn("extern %s=%llu should be integer\n", - ext->name, value); + ext->name, (unsigned long long)value); return -EINVAL; } if (!is_ext_value_in_range(ext, value)) { pr_warn("extern %s=%llu value doesn't fit in %d bytes\n", - ext->name, value, ext->sz); + ext->name, (unsigned long long)value, ext->sz); return -ERANGE; } switch (ext->sz) { From 5bf2fc1f9c88397b125d5ec5f65b1ed9300ba59d Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Thu, 19 Dec 2019 11:57:35 -0600 Subject: [PATCH 090/127] bpf: Remove unnecessary assertion on fp_old The two callers of bpf_prog_realloc - bpf_patch_insn_single and bpf_migrate_filter dereference the struct fp_old, before passing it to the function. Thus assertion to check fp_old is unnecessary and can be removed. Signed-off-by: Aditya Pakki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191219175735.19231-1-pakki001@umn.edu --- kernel/bpf/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2ff01a716128..7622dfc36705 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -222,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, u32 pages, delta; int ret; - BUG_ON(fp_old == NULL); - size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) From 0536b85239b8440735cdd910aae0eb076ebbb439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:09:59 +0100 Subject: [PATCH 091/127] xdp: Simplify devmap cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the RCU flavor consolidation [1], call_rcu() and synchronize_rcu() waits for preempt-disable regions (NAPI) in addition to the read-side critical sections. As a result of this, the cleanup code in devmap can be simplified * There is no longer a need to flush in __dev_map_entry_free, since we know that this has been done when the call_rcu() callback is triggered. * When freeing the map, there is no need to explicitly wait for a flush. It's guaranteed to be done after the synchronize_rcu() call in dev_map_free(). The rcu_barrier() is still needed, so that the map is not freed prior the elements. [1] https://lwn.net/Articles/777036/ Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-2-bjorn.topel@gmail.com --- kernel/bpf/devmap.c | 43 +++++-------------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3d3d61b5985b..b7595de6a91a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -201,7 +201,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i, cpu; + int i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -221,18 +221,6 @@ static void dev_map_free(struct bpf_map *map) /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); - /* To ensure all pending flush operations have completed wait for flush - * list to empty on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new items will be added. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; @@ -345,8 +333,7 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, - bool in_napi_ctx) +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) { struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; @@ -384,11 +371,7 @@ error: for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; - /* RX path under NAPI protection, can return frames faster */ - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); drops++; } goto out; @@ -409,7 +392,7 @@ void __dev_map_flush(struct bpf_map *map) rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) - bq_xmit_all(bq, XDP_XMIT_FLUSH, true); + bq_xmit_all(bq, XDP_XMIT_FLUSH); rcu_read_unlock(); } @@ -440,7 +423,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(bq, 0, true); + bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -509,27 +492,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } -static void dev_map_flush_old(struct bpf_dtab_netdev *dev) -{ - if (dev->dev->netdev_ops->ndo_xdp_xmit) { - struct xdp_bulk_queue *bq; - int cpu; - - rcu_read_lock(); - for_each_online_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(bq, XDP_XMIT_FLUSH, false); - } - rcu_read_unlock(); - } -} - static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - dev_map_flush_old(dev); free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); From 4bc188c7f23a5a308d7f15dda1b6a286d74e8954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:00 +0100 Subject: [PATCH 092/127] xdp: Simplify cpumap cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the RCU flavor consolidation [1], call_rcu() and synchronize_rcu() waits for preempt-disable regions (NAPI) in addition to the read-side critical sections. As a result of this, the cleanup code in cpumap can be simplified * There is no longer a need to flush in __cpu_map_entry_free, since we know that this has been done when the call_rcu() callback is triggered. * When freeing the map, there is no need to explicitly wait for a flush. It's guaranteed to be done after the synchronize_rcu() call in cpu_map_free(). [1] https://lwn.net/Articles/777036/ Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-3-bjorn.topel@gmail.com --- kernel/bpf/cpumap.c | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ef49e17ae47c..04c8eb11cd90 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -75,7 +75,7 @@ struct bpf_cpu_map { struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); +static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { @@ -399,7 +399,6 @@ free_rcu: static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; - int cpu; /* This cpu_map_entry have been disconnected from map and one * RCU graze-period have elapsed. Thus, XDP cannot queue any @@ -408,13 +407,6 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) */ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); - /* Flush remaining packets in percpu bulkq */ - for_each_online_cpu(cpu) { - struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); - - /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(bq, false); - } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ put_cpu_map_entry(rcpu); @@ -507,7 +499,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - int cpu; u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, @@ -522,18 +513,6 @@ static void cpu_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_rcu(); - /* To ensure all pending flush operations have completed wait for flush - * list be empty on _all_ cpus. Because the above synchronize_rcu() - * ensures the map is disconnected from the program we can assume no new - * items will be added to the list. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - /* For cpu_map the remote CPUs can still be using the entries * (struct bpf_cpu_map_entry). */ @@ -599,7 +578,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; @@ -620,10 +599,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } processed++; } @@ -646,7 +622,7 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -688,7 +664,7 @@ void __cpu_map_flush(struct bpf_map *map) struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); From fb5aacdf3603ccbafe1da74eecd132eb4a31e53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:01 +0100 Subject: [PATCH 093/127] xdp: Fix graze->grace type-o in cpumap comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simple spelling fix. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-4-bjorn.topel@gmail.com --- kernel/bpf/cpumap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 04c8eb11cd90..f9deed659798 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -401,7 +401,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct bpf_cpu_map_entry *rcpu; /* This cpu_map_entry have been disconnected from map and one - * RCU graze-period have elapsed. Thus, XDP cannot queue any + * RCU grace-period have elapsed. Thus, XDP cannot queue any * new packets and cannot change/set flush_needed that can * find this entry. */ @@ -428,7 +428,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) * percpu bulkq to queue. Due to caller map_delete_elem() disable * preemption, cannot call kthread_stop() to make sure queue is empty. * Instead a work_queue is started for stopping kthread, - * cpu_map_kthread_stop, which waits for an RCU graze period before + * cpu_map_kthread_stop, which waits for an RCU grace period before * stopping kthread, emptying the queue. */ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, @@ -523,7 +523,7 @@ static void cpu_map_free(struct bpf_map *map) if (!rcpu) continue; - /* bq flush and cleanup happens after RCU graze-period */ + /* bq flush and cleanup happens after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } free_percpu(cmap->flush_list); From e312b9e706ed6d94f6cc9088fcd9fbd81de4525c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:02 +0100 Subject: [PATCH 094/127] xsk: Make xskmap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xskmap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all xskmaps, which simplifies __xsk_map_flush() and xsk_map_alloc(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-5-bjorn.topel@gmail.com --- include/net/xdp_sock.h | 11 ++++------- kernel/bpf/xskmap.c | 18 +++--------------- net/core/filter.c | 9 ++++----- net/xdp/xsk.c | 17 +++++++++-------- 4 files changed, 20 insertions(+), 35 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e3780e4b74e1..48594740d67c 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -72,7 +72,6 @@ struct xdp_umem { struct xsk_map { struct bpf_map map; - struct list_head __percpu *flush_list; spinlock_t lock; /* Synchronize map updates */ struct xdp_sock *xsk_map[]; }; @@ -139,9 +138,8 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); int xsk_map_inc(struct xsk_map *map); void xsk_map_put(struct xsk_map *map); -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs); -void __xsk_map_flush(struct bpf_map *map); +int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); +void __xsk_map_flush(void); static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) @@ -369,13 +367,12 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, return 0; } -static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) +static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; } -static inline void __xsk_map_flush(struct bpf_map *map) +static inline void __xsk_map_flush(void) { } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 90c4fce1c981..2cc5c8f4c800 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct bpf_map_memory mem; - int cpu, err, numa_node; + int err, numa_node; struct xsk_map *m; - u64 cost, size; + u64 size; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus()); - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, size); if (err < 0) return ERR_PTR(err); @@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); - m->flush_list = alloc_percpu(struct list_head); - if (!m->flush_list) { - bpf_map_charge_finish(&m->map.memory); - bpf_map_area_free(m); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); - return &m->map; } @@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_net(); - free_percpu(m->flush_list); bpf_map_area_free(m); } diff --git a/net/core/filter.c b/net/core/filter.c index a411f7835dee..c51678c473c5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3511,8 +3511,7 @@ err: static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_map *map, - struct xdp_buff *xdp, - u32 index) + struct xdp_buff *xdp) { int err; @@ -3537,7 +3536,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, case BPF_MAP_TYPE_XSKMAP: { struct xdp_sock *xs = fwd; - err = __xsk_map_redirect(map, xdp, xs); + err = __xsk_map_redirect(xs, xdp); return err; } default: @@ -3562,7 +3561,7 @@ void xdp_do_flush_map(void) __cpu_map_flush(map); break; case BPF_MAP_TYPE_XSKMAP: - __xsk_map_flush(map); + __xsk_map_flush(); break; default: break; @@ -3619,7 +3618,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) xdp_do_flush_map(); - err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); + err = __bpf_tx_xdp_map(dev, fwd, map, xdp); if (unlikely(err)) goto err; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 956793893c9d..e45c27f5cfca 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -31,6 +31,8 @@ #define TX_BATCH_SIZE 16 +static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); + bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && @@ -264,11 +266,9 @@ out_unlock: return err; } -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) +int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; err = xsk_rcv(xs, xdp); @@ -281,10 +281,9 @@ int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, return 0; } -void __xsk_map_flush(struct bpf_map *map) +void __xsk_map_flush(void) { - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); struct xdp_sock *xs, *tmp; list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { @@ -1177,7 +1176,7 @@ static struct pernet_operations xsk_net_ops = { static int __init xsk_init(void) { - int err; + int err, cpu; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) @@ -1195,6 +1194,8 @@ static int __init xsk_init(void) if (err) goto out_pernet; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); return 0; out_pernet: From 96360004b8628541f5d05a845ea213267db0b1a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:03 +0100 Subject: [PATCH 095/127] xdp: Make devmap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The devmap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all devmaps, which simplifies __dev_map_flush() and dev_map_init_map(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-6-bjorn.topel@gmail.com --- include/linux/bpf.h | 4 ++-- kernel/bpf/devmap.c | 35 +++++++++++++---------------------- net/core/filter.c | 2 +- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d467983e61bb..31191804ca09 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -959,7 +959,7 @@ struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); -void __dev_map_flush(struct bpf_map *map); +void __dev_map_flush(void); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -1068,7 +1068,7 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map return NULL; } -static inline void __dev_map_flush(struct bpf_map *map) +static inline void __dev_map_flush(void) { } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b7595de6a91a..da9c832fc5c8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,7 +75,6 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ - struct list_head __percpu *flush_list; struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -85,6 +84,7 @@ struct bpf_dtab { u32 n_buckets; }; +static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -109,8 +109,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - int err, cpu; - u64 cost; + u64 cost = 0; + int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -125,9 +125,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); - /* make sure page count doesn't overflow */ - cost = (u64) sizeof(struct list_head) * num_possible_cpus(); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -143,17 +140,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (err) return -EINVAL; - dtab->flush_list = alloc_percpu(struct list_head); - if (!dtab->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_percpu; + goto free_charge; spin_lock_init(&dtab->index_lock); } else { @@ -161,13 +151,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_percpu; + goto free_charge; } return 0; -free_percpu: - free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); return -ENOMEM; @@ -254,7 +242,6 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - free_percpu(dtab->flush_list); kfree(dtab); } @@ -384,10 +371,9 @@ error: * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(struct bpf_map *map) +void __dev_map_flush(void) { - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); @@ -419,7 +405,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -777,10 +763,15 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + int cpu; + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); return 0; } diff --git a/net/core/filter.c b/net/core/filter.c index c51678c473c5..b7570cb84902 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3555,7 +3555,7 @@ void xdp_do_flush_map(void) switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: - __dev_map_flush(map); + __dev_map_flush(); break; case BPF_MAP_TYPE_CPUMAP: __cpu_map_flush(map); From cdfafe98cabefeedbbc65af5c191c59745c03298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:04 +0100 Subject: [PATCH 096/127] xdp: Make cpumap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpumap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all devmaps, which simplifies __cpu_map_flush() and cpu_map_alloc(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-7-bjorn.topel@gmail.com --- include/linux/bpf.h | 4 ++-- kernel/bpf/cpumap.c | 36 ++++++++++++++++++------------------ net/core/filter.c | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 31191804ca09..8f3e00c84f39 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -966,7 +966,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); -void __cpu_map_flush(struct bpf_map *map); +void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -1097,7 +1097,7 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) return NULL; } -static inline void __cpu_map_flush(struct bpf_map *map) +static inline void __cpu_map_flush(void) { } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index f9deed659798..70f71b154fa5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -72,17 +72,18 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - struct list_head __percpu *flush_list; }; +static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); + static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; - int ret, cpu; u64 cost; + int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - cmap->flush_list = alloc_percpu(struct list_head); - if (!cmap->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) - goto free_percpu; + goto free_charge; return &cmap->map; -free_percpu: - free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -526,7 +517,6 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -618,7 +608,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq) */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) @@ -657,10 +647,9 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_flush(struct bpf_map *map) +void __cpu_map_flush(void) { - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { @@ -670,3 +659,14 @@ void __cpu_map_flush(struct bpf_map *map) wake_up_process(bq->obj->kthread); } } + +static int __init cpu_map_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); + return 0; +} + +subsys_initcall(cpu_map_init); diff --git a/net/core/filter.c b/net/core/filter.c index b7570cb84902..c706325b3e66 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3558,7 +3558,7 @@ void xdp_do_flush_map(void) __dev_map_flush(); break; case BPF_MAP_TYPE_CPUMAP: - __cpu_map_flush(map); + __cpu_map_flush(); break; case BPF_MAP_TYPE_XSKMAP: __xsk_map_flush(); From 332f22a60e4c3492d4953cd6f7aaa4e8bd0bba97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:05 +0100 Subject: [PATCH 097/127] xdp: Remove map_to_flush and map swap detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that all XDP maps that can be used with bpf_redirect_map() tracks entries to be flushed in a global fashion, there is not need to track that the map has changed and flush from xdp_do_generic_map() anymore. All entries will be flushed in xdp_do_flush_map(). This means that the map_to_flush can be removed, and the corresponding checks. Moving the flush logic to one place, xdp_do_flush_map(), give a bulking behavior and performance boost. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-8-bjorn.topel@gmail.com --- include/linux/filter.h | 1 - net/core/filter.c | 27 +++------------------------ 2 files changed, 3 insertions(+), 25 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 37ac7025031d..69d6706fc889 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -592,7 +592,6 @@ struct bpf_redirect_info { u32 tgt_index; void *tgt_value; struct bpf_map *map; - struct bpf_map *map_to_flush; u32 kern_flags; }; diff --git a/net/core/filter.c b/net/core/filter.c index c706325b3e66..d9caa3e57ea1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3547,26 +3547,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = ri->map_to_flush; - - ri->map_to_flush = NULL; - if (map) { - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - __dev_map_flush(); - break; - case BPF_MAP_TYPE_CPUMAP: - __cpu_map_flush(); - break; - case BPF_MAP_TYPE_XSKMAP: - __xsk_map_flush(); - break; - default: - break; - } - } + __dev_map_flush(); + __cpu_map_flush(); + __xsk_map_flush(); } EXPORT_SYMBOL_GPL(xdp_do_flush_map); @@ -3615,14 +3598,10 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) - xdp_do_flush_map(); - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); if (unlikely(err)) goto err; - ri->map_to_flush = map; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); return 0; err: From 1170beaa3fa3c3381fd820e9d05b84d168fe1dab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:06 +0100 Subject: [PATCH 098/127] xdp: Simplify __bpf_tx_xdp_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The explicit error checking is not needed. Simply return the error instead. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-9-bjorn.topel@gmail.com --- net/core/filter.c | 33 +++++++-------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index d9caa3e57ea1..217af9974c86 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3510,35 +3510,16 @@ err: } static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, - struct xdp_buff *xdp) + struct bpf_map *map, struct xdp_buff *xdp) { - int err; - switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: { - struct bpf_dtab_netdev *dst = fwd; - - err = dev_map_enqueue(dst, xdp, dev_rx); - if (unlikely(err)) - return err; - break; - } - case BPF_MAP_TYPE_CPUMAP: { - struct bpf_cpu_map_entry *rcpu = fwd; - - err = cpu_map_enqueue(rcpu, xdp, dev_rx); - if (unlikely(err)) - return err; - break; - } - case BPF_MAP_TYPE_XSKMAP: { - struct xdp_sock *xs = fwd; - - err = __xsk_map_redirect(xs, xdp); - return err; - } + case BPF_MAP_TYPE_DEVMAP_HASH: + return dev_map_enqueue(fwd, xdp, dev_rx); + case BPF_MAP_TYPE_CPUMAP: + return cpu_map_enqueue(fwd, xdp, dev_rx); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_redirect(fwd, xdp); default: break; } From 1020c1f24a946e7d5d8a67db741b20efcd2cefc5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:33 -0800 Subject: [PATCH 099/127] bpf: Simplify __cgroup_bpf_attach __cgroup_bpf_attach has a lot of identical code to handle two scenarios: BPF_F_ALLOW_MULTI is set and unset. Simplify it by splitting the two main steps: * First, the decision is made whether a new bpf_prog_list entry should be allocated or existing entry should be reused for the new program. This decision is saved in replace_pl pointer; * Next, replace_pl pointer is used to handle both possible states of BPF_F_ALLOW_MULTI flag (set / unset) instead of doing similar work for them separately. This splitting, in turn, allows to make further simplifications: * The check for attaching same program twice in BPF_F_ALLOW_MULTI mode can be done before allocating cgroup storage, so that if user tries to attach same program twice no alloc/free happens as it was before; * pl_was_allocated becomes redundant so it's removed. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/c6193db6fe630797110b0d3ff06c125d093b834c.1576741281.git.rdna@fb.com --- kernel/bpf/cgroup.c | 62 +++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9f90d3c92bda..e8cbdd1be687 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -295,9 +295,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_prog_list *pl, *replace_pl = NULL; enum bpf_cgroup_storage_type stype; - struct bpf_prog_list *pl; - bool pl_was_allocated; int err; if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) @@ -317,6 +316,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + } + } else if (!list_empty(progs)) { + replace_pl = list_first_entry(progs, typeof(*pl), node); + } + for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storage[stype])) { @@ -327,52 +336,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } } - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) { - /* disallow attaching the same prog twice */ - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -EINVAL; - } + if (replace_pl) { + pl = replace_pl; + old_prog = pl->prog; + for_each_cgroup_storage_type(stype) { + old_storage[stype] = pl->storage[stype]; + bpf_cgroup_storage_unlink(old_storage[stype]); } - + } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } - - pl_was_allocated = true; - pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; list_add_tail(&pl->node, progs); - } else { - if (list_empty(progs)) { - pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) { - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -ENOMEM; - } - pl_was_allocated = true; - list_add_tail(&pl->node, progs); - } else { - pl = list_first_entry(progs, typeof(*pl), node); - old_prog = pl->prog; - for_each_cgroup_storage_type(stype) { - old_storage[stype] = pl->storage[stype]; - bpf_cgroup_storage_unlink(old_storage[stype]); - } - pl_was_allocated = false; - } - pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; } + pl->prog = prog; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; + cgrp->bpf.flags[type] = flags; err = update_effective_progs(cgrp, type); @@ -401,7 +385,7 @@ cleanup: pl->storage[stype] = old_storage[stype]; bpf_cgroup_storage_link(old_storage[stype], cgrp, type); } - if (pl_was_allocated) { + if (!replace_pl) { list_del(&pl->node); kfree(pl); } From 9fab329d6a04c0a52a84d207b5e0d83aeb660aa0 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:34 -0800 Subject: [PATCH 100/127] bpf: Remove unused new_flags in hierarchy_allows_attach() new_flags is unused, remove it. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/2c49b30ab750f93cfef04a1e40b097d70c3a39a1.1576741281.git.rdna@fb.com --- kernel/bpf/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e8cbdd1be687..283efe3ce052 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -103,8 +103,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type, - u32 new_flags) + enum bpf_attach_type type) { struct cgroup *p; @@ -303,7 +302,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, /* invalid combination */ return -EINVAL; - if (!hierarchy_allows_attach(cgrp, type, flags)) + if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) From 7dd68b3279f1792103d12e69933db3128c6d416e Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:35 -0800 Subject: [PATCH 101/127] bpf: Support replacing cgroup-bpf program in MULTI mode The common use-case in production is to have multiple cgroup-bpf programs per attach type that cover multiple use-cases. Such programs are attached with BPF_F_ALLOW_MULTI and can be maintained by different people. Order of programs usually matters, for example imagine two egress programs: the first one drops packets and the second one counts packets. If they're swapped the result of counting program will be different. It brings operational challenges with updating cgroup-bpf program(s) attached with BPF_F_ALLOW_MULTI since there is no way to replace a program: * One way to update is to detach all programs first and then attach the new version(s) again in the right order. This introduces an interruption in the work a program is doing and may not be acceptable (e.g. if it's egress firewall); * Another way is attach the new version of a program first and only then detach the old version. This introduces the time interval when two versions of same program are working, what may not be acceptable if a program is not idempotent. It also imposes additional burden on program developers to make sure that two versions of their program can co-exist. Solve the problem by introducing a "replace" mode in BPF_PROG_ATTACH command for cgroup-bpf programs being attached with BPF_F_ALLOW_MULTI flag. This mode is enabled by newly introduced BPF_F_REPLACE attach flag and bpf_attr.replace_bpf_fd attribute to pass fd of the old program to replace That way user can replace any program among those attached with BPF_F_ALLOW_MULTI flag without the problems described above. Details of the new API: * If BPF_F_REPLACE is set but replace_bpf_fd doesn't have valid descriptor of BPF program, BPF_PROG_ATTACH will return corresponding error (EINVAL or EBADF). * If replace_bpf_fd has valid descriptor of BPF program but such a program is not attached to specified cgroup, BPF_PROG_ATTACH will return ENOENT. BPF_F_REPLACE is introduced to make the user intent clear, since replace_bpf_fd alone can't be used for this (its default value, 0, is a valid fd). BPF_F_REPLACE also makes it possible to extend the API in the future (e.g. add BPF_F_BEFORE and BPF_F_AFTER if needed). Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Narkyiko Link: https://lore.kernel.org/bpf/30cd850044a0057bdfcaaf154b7d2f39850ba813.1576741281.git.rdna@fb.com --- include/linux/bpf-cgroup.h | 4 +++- include/uapi/linux/bpf.h | 10 ++++++++++ kernel/bpf/cgroup.c | 30 ++++++++++++++++++++++++++---- kernel/bpf/syscall.c | 4 ++-- kernel/cgroup/cgroup.c | 5 +++-- tools/include/uapi/linux/bpf.h | 10 ++++++++++ 6 files changed, 54 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 169fd25f6bc2..18f6a6da7c3c 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -85,6 +85,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_prog *replace_prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type); @@ -93,7 +94,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags); + struct bpf_prog *replace_prog, enum bpf_attach_type type, + u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dbbcf0b02970..7df436da542d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -231,6 +231,11 @@ enum bpf_attach_type { * When children program makes decision (like picking TCP CA or sock bind) * parent program has a chance to override it. * + * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of + * programs for a cgroup. Though it's possible to replace an old program at + * any position by also specifying BPF_F_REPLACE flag and position itself in + * replace_bpf_fd attribute. Old program at this position will be released. + * * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. * A cgroup with NONE doesn't allow any programs in sub-cgroups. * Ex1: @@ -249,6 +254,7 @@ enum bpf_attach_type { */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) #define BPF_F_ALLOW_MULTI (1U << 1) +#define BPF_F_REPLACE (1U << 2) /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel @@ -442,6 +448,10 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; + __u32 replace_bpf_fd; /* previously attached eBPF + * program to replace if + * BPF_F_REPLACE is used + */ }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 283efe3ce052..45346c79613a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -282,14 +282,17 @@ cleanup: * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach + * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * * Must be called with cgroup_mutex held. */ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_prog *replace_prog, enum bpf_attach_type type, u32 flags) { + u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], @@ -298,14 +301,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_cgroup_storage_type stype; int err; - if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || + ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -320,7 +324,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (pl->prog == prog) /* disallow attaching the same prog twice */ return -EINVAL; + if (pl->prog == replace_prog) + replace_pl = pl; } + if ((flags & BPF_F_REPLACE) && !replace_pl) + /* prog to replace not found for cgroup */ + return -ENOENT; } else if (!list_empty(progs)) { replace_pl = list_first_entry(progs, typeof(*pl), node); } @@ -356,7 +365,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, for_each_cgroup_storage_type(stype) pl->storage[stype] = storage[stype]; - cgrp->bpf.flags[type] = flags; + cgrp->bpf.flags[type] = saved_flags; err = update_effective_progs(cgrp, type); if (err) @@ -522,6 +531,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { + struct bpf_prog *replace_prog = NULL; struct cgroup *cgrp; int ret; @@ -529,8 +539,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + if ((attr->attach_flags & BPF_F_ALLOW_MULTI) && + (attr->attach_flags & BPF_F_REPLACE)) { + replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype); + if (IS_ERR(replace_prog)) { + cgroup_put(cgrp); + return PTR_ERR(replace_prog); + } + } + + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, attr->attach_flags); + + if (replace_prog) + bpf_prog_put(replace_prog); cgroup_put(cgrp); return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b08c362f4e02..81ee8595dfee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2073,10 +2073,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags +#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd #define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) static int bpf_prog_attach(const union bpf_attr *attr) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 735af8f15f95..725365df066d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6288,12 +6288,13 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #ifdef CONFIG_CGROUP_BPF int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags) + struct bpf_prog *replace_prog, enum bpf_attach_type type, + u32 flags) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index dbbcf0b02970..7df436da542d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -231,6 +231,11 @@ enum bpf_attach_type { * When children program makes decision (like picking TCP CA or sock bind) * parent program has a chance to override it. * + * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of + * programs for a cgroup. Though it's possible to replace an old program at + * any position by also specifying BPF_F_REPLACE flag and position itself in + * replace_bpf_fd attribute. Old program at this position will be released. + * * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. * A cgroup with NONE doesn't allow any programs in sub-cgroups. * Ex1: @@ -249,6 +254,7 @@ enum bpf_attach_type { */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) #define BPF_F_ALLOW_MULTI (1U << 1) +#define BPF_F_REPLACE (1U << 2) /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel @@ -442,6 +448,10 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; + __u32 replace_bpf_fd; /* previously attached eBPF + * program to replace if + * BPF_F_REPLACE is used + */ }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ From cdbee3839cd91a4577425c43cf064b9523926159 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:36 -0800 Subject: [PATCH 102/127] libbpf: Introduce bpf_prog_attach_xattr Introduce a new bpf_prog_attach_xattr function that, in addition to program fd, target fd and attach type, accepts an extendable struct bpf_prog_attach_opts. bpf_prog_attach_opts relies on DECLARE_LIBBPF_OPTS macro to maintain backward and forward compatibility and has the following "optional" attach attributes: * existing attach_flags, since it's not required when attaching in NONE mode. Even though it's quite often used in MULTI and OVERRIDE mode it seems to be a good idea to reduce number of arguments to bpf_prog_attach_xattr; * newly introduced attribute of BPF_PROG_ATTACH command: replace_prog_fd that is fd of previously attached cgroup-bpf program to replace if BPF_F_REPLACE flag is used. The new function is named to be consistent with other xattr-functions (bpf_prog_test_run_xattr, bpf_create_map_xattr, bpf_load_program_xattr). The struct bpf_prog_attach_opts is supposed to be used with DECLARE_LIBBPF_OPTS macro. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/bd6e0732303eb14e4b79cb128268d9e9ad6db208.1576741281.git.rdna@fb.com --- tools/lib/bpf/bpf.c | 17 ++++++++++++++++- tools/lib/bpf/bpf.h | 11 +++++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 98596e15390f..a787d53699c8 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -466,14 +466,29 @@ int bpf_obj_get(const char *pathname) int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, unsigned int flags) +{ + DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts, + .flags = flags, + ); + + return bpf_prog_attach_xattr(prog_fd, target_fd, type, &opts); +} + +int bpf_prog_attach_xattr(int prog_fd, int target_fd, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts) { union bpf_attr attr; + if (!OPTS_VALID(opts, bpf_prog_attach_opts)) + return -EINVAL; + memset(&attr, 0, sizeof(attr)); attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; - attr.attach_flags = flags; + attr.attach_flags = OPTS_GET(opts, flags, 0); + attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0); return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 269807ce9ef5..f0ab8519986e 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -126,8 +126,19 @@ LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); LIBBPF_API int bpf_map_freeze(int fd); LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); LIBBPF_API int bpf_obj_get(const char *pathname); + +struct bpf_prog_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + unsigned int flags; + int replace_prog_fd; +}; +#define bpf_prog_attach_opts__last_field replace_prog_fd + LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, unsigned int flags); +LIBBPF_API int bpf_prog_attach_xattr(int prog_fd, int attachable_fd, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts); LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index e3a471f38a71..e9713a574243 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -219,6 +219,7 @@ LIBBPF_0.0.7 { bpf_object__detach_skeleton; bpf_object__load_skeleton; bpf_object__open_skeleton; + bpf_prog_attach_xattr; bpf_program__attach; bpf_program__name; btf__align_of; From 257c88559f360ef40d251942f1f9f0c55f5f91ca Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:37 -0800 Subject: [PATCH 103/127] selftests/bpf: Convert test_cgroup_attach to prog_tests Convert test_cgroup_attach to prog_tests. This change does a lot of things but in many cases it's pretty expensive to separate them, so they go in one commit. Nevertheless the logic is ketp as is and changes made are just moving things around, simplifying them (w/o changing the meaning of the tests) and making prog_tests compatible: * split the 3 tests in the file into 3 separate files in prog_tests/; * rename the test functions to test_; * remove unused includes, constants, variables and functions from every test; * replace `if`-s with or `if (CHECK())` where additional context should be logged and with `if (CHECK_FAIL())` where line number is enough; * switch from `log_err()` to logging via `CHECK()`; * replace `assert`-s with `CHECK_FAIL()` to avoid crashing the whole test_progs if one assertion fails; * replace cgroup_helpers with test__join_cgroup() in cgroup_attach_override only, other tests need more fine-grained control for cgroup creation/deletion so cgroup_helpers are still used there; * simplify cgroup_attach_autodetach by switching to easiest possible program since this test doesn't really need such a complicated program as cgroup_attach_multi does; * remove test_cgroup_attach.c itself. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/0ff19cc64d2dc5cf404349f07131119480e10e32.1576741281.git.rdna@fb.com --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 3 +- .../bpf/prog_tests/cgroup_attach_autodetach.c | 111 ++++ .../bpf/prog_tests/cgroup_attach_multi.c | 238 ++++++++ .../bpf/prog_tests/cgroup_attach_override.c | 148 +++++ .../selftests/bpf/test_cgroup_attach.c | 571 ------------------ 6 files changed, 498 insertions(+), 574 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c delete mode 100644 tools/testing/selftests/bpf/test_cgroup_attach.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index ce5af95ede42..b139b3d75ebb 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -21,7 +21,6 @@ test_lirc_mode2_user get_cgroup_id_user test_skb_cgroup_id_user test_socket_cookie -test_cgroup_attach test_cgroup_storage test_select_reuseport test_flow_dissector diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c652bd84ef0e..866fc1cadd7c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -32,7 +32,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \ test_cgroup_storage \ test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \ - test_cgroup_attach test_progs-no_alu32 + test_progs-no_alu32 # Also test bpf-gcc, if present ifneq ($(BPF_GCC),) @@ -136,7 +136,6 @@ $(OUTPUT)/test_cgroup_storage: cgroup_helpers.c $(OUTPUT)/test_netcnt: cgroup_helpers.c $(OUTPUT)/test_sock_fields: cgroup_helpers.c $(OUTPUT)/test_sysctl: cgroup_helpers.c -$(OUTPUT)/test_cgroup_attach: cgroup_helpers.c .PHONY: force diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c new file mode 100644 index 000000000000..5b13f2c6c402 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "cgroup_helpers.h" + +#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static int prog_load(void) +{ + struct bpf_insn prog[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = 1 */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + + return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); +} + +void test_cgroup_attach_autodetach(void) +{ + __u32 duration = 0, prog_cnt = 4, attach_flags; + int allow_prog[2] = {-1}; + __u32 prog_ids[2] = {0}; + void *ptr = NULL; + int cg = 0, i; + int attempts; + + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { + allow_prog[i] = prog_load(); + if (CHECK(allow_prog[i] < 0, "prog_load", + "verifier output:\n%s\n-------\n", bpf_log_buf)) + goto err; + } + + if (CHECK_FAIL(setup_cgroup_environment())) + goto err; + + /* create a cgroup, attach two programs and remember their ids */ + cg = create_and_get_cgroup("/cg_autodetach"); + if (CHECK_FAIL(cg < 0)) + goto err; + + if (CHECK_FAIL(join_cgroup("/cg_autodetach"))) + goto err; + + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) + if (CHECK(bpf_prog_attach(allow_prog[i], cg, + BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI), + "prog_attach", "prog[%d], errno=%d\n", i, errno)) + goto err; + + /* make sure that programs are attached and run some traffic */ + if (CHECK(bpf_prog_query(cg, BPF_CGROUP_INET_EGRESS, 0, &attach_flags, + prog_ids, &prog_cnt), + "prog_query", "errno=%d\n", errno)) + goto err; + if (CHECK_FAIL(system(PING_CMD))) + goto err; + + /* allocate some memory (4Mb) to pin the original cgroup */ + ptr = malloc(4 * (1 << 20)); + if (CHECK_FAIL(!ptr)) + goto err; + + /* close programs and cgroup fd */ + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { + close(allow_prog[i]); + allow_prog[i] = -1; + } + + close(cg); + cg = 0; + + /* leave the cgroup and remove it. don't detach programs */ + cleanup_cgroup_environment(); + + /* wait for the asynchronous auto-detachment. + * wait for no more than 5 sec and give up. + */ + for (i = 0; i < ARRAY_SIZE(prog_ids); i++) { + for (attempts = 5; attempts >= 0; attempts--) { + int fd = bpf_prog_get_fd_by_id(prog_ids[i]); + + if (fd < 0) + break; + + /* don't leave the fd open */ + close(fd); + + if (CHECK_FAIL(!attempts)) + goto err; + + sleep(1); + } + } + +err: + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) + if (allow_prog[i] >= 0) + close(allow_prog[i]); + if (cg) + close(cg); + free(ptr); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c new file mode 100644 index 000000000000..4eaab7435044 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "cgroup_helpers.h" + +#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static int map_fd = -1; + +static int prog_load_cnt(int verdict, int val) +{ + int cgroup_storage_fd, percpu_cgroup_storage_fd; + + if (map_fd < 0) + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (map_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + + cgroup_storage_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, + sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); + if (cgroup_storage_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + + percpu_cgroup_storage_fd = bpf_create_map( + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); + if (percpu_cgroup_storage_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + + struct bpf_insn prog[] = { + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), + BPF_MOV64_IMM(BPF_REG_1, val), + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0), + + BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1), + BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0), + + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + int ret; + + ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); + + close(cgroup_storage_fd); + return ret; +} + +void test_cgroup_attach_multi(void) +{ + __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id; + int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; + int allow_prog[6] = {-1}; + unsigned long long value; + __u32 duration = 0; + int i = 0; + + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { + allow_prog[i] = prog_load_cnt(1, 1 << i); + if (CHECK(allow_prog[i] < 0, "prog_load", + "verifier output:\n%s\n-------\n", bpf_log_buf)) + goto err; + } + + if (CHECK_FAIL(setup_cgroup_environment())) + goto err; + + cg1 = create_and_get_cgroup("/cg1"); + if (CHECK_FAIL(cg1 < 0)) + goto err; + cg2 = create_and_get_cgroup("/cg1/cg2"); + if (CHECK_FAIL(cg2 < 0)) + goto err; + cg3 = create_and_get_cgroup("/cg1/cg2/cg3"); + if (CHECK_FAIL(cg3 < 0)) + goto err; + cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4"); + if (CHECK_FAIL(cg4 < 0)) + goto err; + cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5"); + if (CHECK_FAIL(cg5 < 0)) + goto err; + + if (CHECK_FAIL(join_cgroup("/cg1/cg2/cg3/cg4/cg5"))) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI), + "prog0_attach_to_cg1_multi", "errno=%d\n", errno)) + goto err; + + if (CHECK(!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI), + "fail_same_prog_attach_to_cg1", "unexpected success\n")) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI), + "prog1_attach_to_cg1_multi", "errno=%d\n", errno)) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog2_attach_to_cg2_override", "errno=%d\n", errno)) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI), + "prog3_attach_to_cg3_multi", "errno=%d\n", errno)) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog4_attach_to_cg4_override", "errno=%d\n", errno)) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0), + "prog5_attach_to_cg5_none", "errno=%d\n", errno)) + goto err; + + CHECK_FAIL(system(PING_CMD)); + CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); + CHECK_FAIL(value != 1 + 2 + 8 + 32); + + /* query the number of effective progs in cg5 */ + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, NULL, NULL, &prog_cnt)); + CHECK_FAIL(prog_cnt != 4); + /* retrieve prog_ids of effective progs in cg5 */ + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, &attach_flags, + prog_ids, &prog_cnt)); + CHECK_FAIL(prog_cnt != 4); + CHECK_FAIL(attach_flags != 0); + saved_prog_id = prog_ids[0]; + /* check enospc handling */ + prog_ids[0] = 0; + prog_cnt = 2; + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, &attach_flags, + prog_ids, &prog_cnt) != -1); + CHECK_FAIL(errno != ENOSPC); + CHECK_FAIL(prog_cnt != 4); + /* check that prog_ids are returned even when buffer is too small */ + CHECK_FAIL(prog_ids[0] != saved_prog_id); + /* retrieve prog_id of single attached prog in cg5 */ + prog_ids[0] = 0; + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, NULL, + prog_ids, &prog_cnt)); + CHECK_FAIL(prog_cnt != 1); + CHECK_FAIL(prog_ids[0] != saved_prog_id); + + /* detach bottom program and ping again */ + if (CHECK(bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS), + "prog_detach_from_cg5", "errno=%d\n", errno)) + goto err; + + value = 0; + CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); + CHECK_FAIL(system(PING_CMD)); + CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); + CHECK_FAIL(value != 1 + 2 + 8 + 16); + + /* detach 3rd from bottom program and ping again */ + if (CHECK(!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS), + "fail_prog_detach_from_cg3", "unexpected success\n")) + goto err; + + if (CHECK(bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS), + "prog3_detach_from_cg3", "errno=%d\n", errno)) + goto err; + + value = 0; + CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); + CHECK_FAIL(system(PING_CMD)); + CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); + CHECK_FAIL(value != 1 + 2 + 16); + + /* detach 2nd from bottom program and ping again */ + if (CHECK(bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS), + "prog_detach_from_cg4", "errno=%d\n", errno)) + goto err; + + value = 0; + CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); + CHECK_FAIL(system(PING_CMD)); + CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); + CHECK_FAIL(value != 1 + 2 + 4); + + prog_cnt = 4; + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, &attach_flags, + prog_ids, &prog_cnt)); + CHECK_FAIL(prog_cnt != 3); + CHECK_FAIL(attach_flags != 0); + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, NULL, + prog_ids, &prog_cnt)); + CHECK_FAIL(prog_cnt != 0); + +err: + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) + if (allow_prog[i] >= 0) + close(allow_prog[i]); + close(cg1); + close(cg2); + close(cg3); + close(cg4); + close(cg5); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c new file mode 100644 index 000000000000..9d8cb48b99de --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "cgroup_helpers.h" + +#define FOO "/foo" +#define BAR "/foo/bar/" +#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static int prog_load(int verdict) +{ + struct bpf_insn prog[] = { + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + + return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); +} + +void test_cgroup_attach_override(void) +{ + int drop_prog = -1, allow_prog = -1, foo = -1, bar = -1; + __u32 duration = 0; + + allow_prog = prog_load(1); + if (CHECK(allow_prog < 0, "prog_load_allow", + "verifier output:\n%s\n-------\n", bpf_log_buf)) + goto err; + + drop_prog = prog_load(0); + if (CHECK(drop_prog < 0, "prog_load_drop", + "verifier output:\n%s\n-------\n", bpf_log_buf)) + goto err; + + foo = test__join_cgroup(FOO); + if (CHECK(foo < 0, "cgroup_join_foo", "cgroup setup failed\n")) + goto err; + + if (CHECK(bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog_attach_drop_foo_override", + "attach prog to %s failed, errno=%d\n", FOO, errno)) + goto err; + + if (CHECK(!system(PING_CMD), "ping_fail", + "ping unexpectedly succeeded\n")) + goto err; + + bar = test__join_cgroup(BAR); + if (CHECK(bar < 0, "cgroup_join_bar", "cgroup setup failed\n")) + goto err; + + if (CHECK(!system(PING_CMD), "ping_fail", + "ping unexpectedly succeeded\n")) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog_attach_allow_bar_override", + "attach prog to %s failed, errno=%d\n", BAR, errno)) + goto err; + + if (CHECK(system(PING_CMD), "ping_ok", "ping failed\n")) + goto err; + + if (CHECK(bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS), + "prog_detach_bar", + "detach prog from %s failed, errno=%d\n", BAR, errno)) + goto err; + + if (CHECK(!system(PING_CMD), "ping_fail", + "ping unexpectedly succeeded\n")) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog_attach_allow_bar_override", + "attach prog to %s failed, errno=%d\n", BAR, errno)) + goto err; + + if (CHECK(bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS), + "prog_detach_foo", + "detach prog from %s failed, errno=%d\n", FOO, errno)) + goto err; + + if (CHECK(system(PING_CMD), "ping_ok", "ping failed\n")) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog_attach_allow_bar_override", + "attach prog to %s failed, errno=%d\n", BAR, errno)) + goto err; + + if (CHECK(!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0), + "fail_prog_attach_allow_bar_none", + "attach prog to %s unexpectedly succeeded\n", BAR)) + goto err; + + if (CHECK(bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS), + "prog_detach_bar", + "detach prog from %s failed, errno=%d\n", BAR, errno)) + goto err; + + if (CHECK(!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS), + "fail_prog_detach_foo", + "double detach from %s unexpectedly succeeded\n", FOO)) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0), + "prog_attach_allow_foo_none", + "attach prog to %s failed, errno=%d\n", FOO, errno)) + goto err; + + if (CHECK(!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0), + "fail_prog_attach_allow_bar_none", + "attach prog to %s unexpectedly succeeded\n", BAR)) + goto err; + + if (CHECK(!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "fail_prog_attach_allow_bar_override", + "attach prog to %s unexpectedly succeeded\n", BAR)) + goto err; + + if (CHECK(!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "fail_prog_attach_allow_foo_override", + "attach prog to %s unexpectedly succeeded\n", FOO)) + goto err; + + if (CHECK(bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0), + "prog_attach_drop_foo_none", + "attach prog to %s failed, errno=%d\n", FOO, errno)) + goto err; + +err: + close(foo); + close(bar); + close(allow_prog); + close(drop_prog); +} diff --git a/tools/testing/selftests/bpf/test_cgroup_attach.c b/tools/testing/selftests/bpf/test_cgroup_attach.c deleted file mode 100644 index 7671909ee1cb..000000000000 --- a/tools/testing/selftests/bpf/test_cgroup_attach.c +++ /dev/null @@ -1,571 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* eBPF example program: - * - * - Creates arraymap in kernel with 4 bytes keys and 8 byte values - * - * - Loads eBPF program - * - * The eBPF program accesses the map passed in to store two pieces of - * information. The number of invocations of the program, which maps - * to the number of packets received, is stored to key 0. Key 1 is - * incremented on each iteration by the number of bytes stored in - * the skb. The program also stores the number of received bytes - * in the cgroup storage. - * - * - Attaches the new program to a cgroup using BPF_PROG_ATTACH - * - * - Every second, reads map[0] and map[1] to see how many bytes and - * packets were seen on any socket of tasks in the given cgroup. - */ - -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "bpf_util.h" -#include "bpf_rlimit.h" -#include "cgroup_helpers.h" - -#define FOO "/foo" -#define BAR "/foo/bar/" -#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -#ifdef DEBUG -#define debug(args...) printf(args) -#else -#define debug(args...) -#endif - -static int prog_load(int verdict) -{ - int ret; - struct bpf_insn prog[] = { - BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ - BPF_EXIT_INSN(), - }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - - ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); - - if (ret < 0) { - log_err("Loading program"); - printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); - return 0; - } - return ret; -} - -static int test_foo_bar(void) -{ - int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0; - - allow_prog = prog_load(1); - if (!allow_prog) - goto err; - - drop_prog = prog_load(0); - if (!drop_prog) - goto err; - - if (setup_cgroup_environment()) - goto err; - - /* Create cgroup /foo, get fd, and join it */ - foo = create_and_get_cgroup(FOO); - if (foo < 0) - goto err; - - if (join_cgroup(FOO)) - goto err; - - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo"); - goto err; - } - - debug("Attached DROP prog. This ping in cgroup /foo should fail...\n"); - assert(system(PING_CMD) != 0); - - /* Create cgroup /foo/bar, get fd, and join it */ - bar = create_and_get_cgroup(BAR); - if (bar < 0) - goto err; - - if (join_cgroup(BAR)) - goto err; - - debug("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); - assert(system(PING_CMD) != 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - debug("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n"); - assert(system(PING_CMD) == 0); - - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo/bar"); - goto err; - } - - debug("Detached PASS from /foo/bar while DROP is attached to /foo.\n" - "This ping in cgroup /foo/bar should fail...\n"); - assert(system(PING_CMD) != 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - if (bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo"); - goto err; - } - - debug("Attached PASS from /foo/bar and detached DROP from /foo.\n" - "This ping in cgroup /foo/bar should pass...\n"); - assert(system(PING_CMD) == 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { - errno = 0; - log_err("Unexpected success attaching prog to /foo/bar"); - goto err; - } - - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo/bar"); - goto err; - } - - if (!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { - errno = 0; - log_err("Unexpected success in double detach from /foo"); - goto err; - } - - if (bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { - log_err("Attaching non-overridable prog to /foo"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { - errno = 0; - log_err("Unexpected success attaching non-overridable prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - errno = 0; - log_err("Unexpected success attaching overridable prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - errno = 0; - log_err("Unexpected success attaching overridable prog to /foo"); - goto err; - } - - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { - log_err("Attaching different non-overridable prog to /foo"); - goto err; - } - - goto out; - -err: - rc = 1; - -out: - close(foo); - close(bar); - cleanup_cgroup_environment(); - if (!rc) - printf("#override:PASS\n"); - else - printf("#override:FAIL\n"); - return rc; -} - -static int map_fd = -1; - -static int prog_load_cnt(int verdict, int val) -{ - int cgroup_storage_fd, percpu_cgroup_storage_fd; - - if (map_fd < 0) - map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); - if (map_fd < 0) { - printf("failed to create map '%s'\n", strerror(errno)); - return -1; - } - - cgroup_storage_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, - sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); - if (cgroup_storage_fd < 0) { - printf("failed to create map '%s'\n", strerror(errno)); - return -1; - } - - percpu_cgroup_storage_fd = bpf_create_map( - BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, - sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); - if (percpu_cgroup_storage_fd < 0) { - printf("failed to create map '%s'\n", strerror(errno)); - return -1; - } - - struct bpf_insn prog[] = { - BPF_MOV32_IMM(BPF_REG_0, 0), - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ - BPF_LD_MAP_FD(BPF_REG_1, map_fd), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), - BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ - - BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd), - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), - BPF_MOV64_IMM(BPF_REG_1, val), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0), - - BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd), - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), - BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1), - BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0), - - BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ - BPF_EXIT_INSN(), - }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - int ret; - - ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); - - if (ret < 0) { - log_err("Loading program"); - printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); - return 0; - } - close(cgroup_storage_fd); - return ret; -} - - -static int test_multiprog(void) -{ - __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id; - int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; - int drop_prog, allow_prog[6] = {}, rc = 0; - unsigned long long value; - int i = 0; - - for (i = 0; i < 6; i++) { - allow_prog[i] = prog_load_cnt(1, 1 << i); - if (!allow_prog[i]) - goto err; - } - drop_prog = prog_load_cnt(0, 1); - if (!drop_prog) - goto err; - - if (setup_cgroup_environment()) - goto err; - - cg1 = create_and_get_cgroup("/cg1"); - if (cg1 < 0) - goto err; - cg2 = create_and_get_cgroup("/cg1/cg2"); - if (cg2 < 0) - goto err; - cg3 = create_and_get_cgroup("/cg1/cg2/cg3"); - if (cg3 < 0) - goto err; - cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4"); - if (cg4 < 0) - goto err; - cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5"); - if (cg5 < 0) - goto err; - - if (join_cgroup("/cg1/cg2/cg3/cg4/cg5")) - goto err; - - if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_MULTI)) { - log_err("Attaching prog to cg1"); - goto err; - } - if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_MULTI)) { - log_err("Unexpected success attaching the same prog to cg1"); - goto err; - } - if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_MULTI)) { - log_err("Attaching prog2 to cg1"); - goto err; - } - if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to cg2"); - goto err; - } - if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_MULTI)) { - log_err("Attaching prog to cg3"); - goto err; - } - if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to cg4"); - goto err; - } - if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) { - log_err("Attaching prog to cg5"); - goto err; - } - assert(system(PING_CMD) == 0); - assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); - assert(value == 1 + 2 + 8 + 32); - - /* query the number of effective progs in cg5 */ - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, - NULL, NULL, &prog_cnt) == 0); - assert(prog_cnt == 4); - /* retrieve prog_ids of effective progs in cg5 */ - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, - &attach_flags, prog_ids, &prog_cnt) == 0); - assert(prog_cnt == 4); - assert(attach_flags == 0); - saved_prog_id = prog_ids[0]; - /* check enospc handling */ - prog_ids[0] = 0; - prog_cnt = 2; - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, - &attach_flags, prog_ids, &prog_cnt) == -1 && - errno == ENOSPC); - assert(prog_cnt == 4); - /* check that prog_ids are returned even when buffer is too small */ - assert(prog_ids[0] == saved_prog_id); - /* retrieve prog_id of single attached prog in cg5 */ - prog_ids[0] = 0; - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, - NULL, prog_ids, &prog_cnt) == 0); - assert(prog_cnt == 1); - assert(prog_ids[0] == saved_prog_id); - - /* detach bottom program and ping again */ - if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching prog from cg5"); - goto err; - } - value = 0; - assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); - assert(system(PING_CMD) == 0); - assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); - assert(value == 1 + 2 + 8 + 16); - - /* detach 3rd from bottom program and ping again */ - errno = 0; - if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) { - log_err("Unexpected success on detach from cg3"); - goto err; - } - if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching from cg3"); - goto err; - } - value = 0; - assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); - assert(system(PING_CMD) == 0); - assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); - assert(value == 1 + 2 + 16); - - /* detach 2nd from bottom program and ping again */ - if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching prog from cg4"); - goto err; - } - value = 0; - assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); - assert(system(PING_CMD) == 0); - assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); - assert(value == 1 + 2 + 4); - - prog_cnt = 4; - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, - &attach_flags, prog_ids, &prog_cnt) == 0); - assert(prog_cnt == 3); - assert(attach_flags == 0); - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, - NULL, prog_ids, &prog_cnt) == 0); - assert(prog_cnt == 0); - goto out; -err: - rc = 1; - -out: - for (i = 0; i < 6; i++) - if (allow_prog[i] > 0) - close(allow_prog[i]); - close(cg1); - close(cg2); - close(cg3); - close(cg4); - close(cg5); - cleanup_cgroup_environment(); - if (!rc) - printf("#multi:PASS\n"); - else - printf("#multi:FAIL\n"); - return rc; -} - -static int test_autodetach(void) -{ - __u32 prog_cnt = 4, attach_flags; - int allow_prog[2] = {0}; - __u32 prog_ids[2] = {0}; - int cg = 0, i, rc = -1; - void *ptr = NULL; - int attempts; - - for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { - allow_prog[i] = prog_load_cnt(1, 1 << i); - if (!allow_prog[i]) - goto err; - } - - if (setup_cgroup_environment()) - goto err; - - /* create a cgroup, attach two programs and remember their ids */ - cg = create_and_get_cgroup("/cg_autodetach"); - if (cg < 0) - goto err; - - if (join_cgroup("/cg_autodetach")) - goto err; - - for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { - if (bpf_prog_attach(allow_prog[i], cg, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_MULTI)) { - log_err("Attaching prog[%d] to cg:egress", i); - goto err; - } - } - - /* make sure that programs are attached and run some traffic */ - assert(bpf_prog_query(cg, BPF_CGROUP_INET_EGRESS, 0, &attach_flags, - prog_ids, &prog_cnt) == 0); - assert(system(PING_CMD) == 0); - - /* allocate some memory (4Mb) to pin the original cgroup */ - ptr = malloc(4 * (1 << 20)); - if (!ptr) - goto err; - - /* close programs and cgroup fd */ - for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { - close(allow_prog[i]); - allow_prog[i] = 0; - } - - close(cg); - cg = 0; - - /* leave the cgroup and remove it. don't detach programs */ - cleanup_cgroup_environment(); - - /* wait for the asynchronous auto-detachment. - * wait for no more than 5 sec and give up. - */ - for (i = 0; i < ARRAY_SIZE(prog_ids); i++) { - for (attempts = 5; attempts >= 0; attempts--) { - int fd = bpf_prog_get_fd_by_id(prog_ids[i]); - - if (fd < 0) - break; - - /* don't leave the fd open */ - close(fd); - - if (!attempts) - goto err; - - sleep(1); - } - } - - rc = 0; -err: - for (i = 0; i < ARRAY_SIZE(allow_prog); i++) - if (allow_prog[i] > 0) - close(allow_prog[i]); - if (cg) - close(cg); - free(ptr); - cleanup_cgroup_environment(); - if (!rc) - printf("#autodetach:PASS\n"); - else - printf("#autodetach:FAIL\n"); - return rc; -} - -int main(void) -{ - int (*tests[])(void) = { - test_foo_bar, - test_multiprog, - test_autodetach, - }; - int errors = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(tests); i++) - if (tests[i]()) - errors++; - - if (errors) - printf("test_cgroup_attach:FAIL\n"); - else - printf("test_cgroup_attach:PASS\n"); - - return errors ? EXIT_FAILURE : EXIT_SUCCESS; -} From 06ac0186bd242d44f5703ce64ee0077b317c86bb Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:38 -0800 Subject: [PATCH 104/127] selftests/bpf: Test BPF_F_REPLACE in cgroup_attach_multi Test replacing a cgroup-bpf program attached with BPF_F_ALLOW_MULTI and possible failure modes: invalid combination of flags, invalid replace_bpf_fd, replacing a non-attachd to specified cgroup program. Example of program replacing: # gdb -q --args ./test_progs --name=cgroup_attach_multi ... Breakpoint 1, test_cgroup_attach_multi () at cgroup_attach_multi.c:227 (gdb) [1]+ Stopped gdb -q --args ./test_progs --name=cgroup_attach_multi # bpftool c s /mnt/cgroup2/cgroup-test-work-dir/cg1 ID AttachType AttachFlags Name 2133 egress multi 2134 egress multi # fg gdb -q --args ./test_progs --name=cgroup_attach_multi (gdb) c Continuing. Breakpoint 2, test_cgroup_attach_multi () at cgroup_attach_multi.c:233 (gdb) [1]+ Stopped gdb -q --args ./test_progs --name=cgroup_attach_multi # bpftool c s /mnt/cgroup2/cgroup-test-work-dir/cg1 ID AttachType AttachFlags Name 2139 egress multi 2134 egress multi Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/7b9b83e8d5fb82e15b034341bd40b6fb2431eeba.1576741281.git.rdna@fb.com --- .../bpf/prog_tests/cgroup_attach_multi.c | 53 +++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c index 4eaab7435044..2ff21dbce179 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -78,7 +78,8 @@ void test_cgroup_attach_multi(void) { __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id; int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; - int allow_prog[6] = {-1}; + DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, attach_opts); + int allow_prog[7] = {-1}; unsigned long long value; __u32 duration = 0; int i = 0; @@ -189,6 +190,52 @@ void test_cgroup_attach_multi(void) CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); CHECK_FAIL(value != 1 + 2 + 8 + 16); + /* test replace */ + + attach_opts.flags = BPF_F_ALLOW_OVERRIDE | BPF_F_REPLACE; + attach_opts.replace_prog_fd = allow_prog[0]; + if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "fail_prog_replace_override", "unexpected success\n")) + goto err; + CHECK_FAIL(errno != EINVAL); + + attach_opts.flags = BPF_F_REPLACE; + if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "fail_prog_replace_no_multi", "unexpected success\n")) + goto err; + CHECK_FAIL(errno != EINVAL); + + attach_opts.flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE; + attach_opts.replace_prog_fd = -1; + if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "fail_prog_replace_bad_fd", "unexpected success\n")) + goto err; + CHECK_FAIL(errno != EBADF); + + /* replacing a program that is not attached to cgroup should fail */ + attach_opts.replace_prog_fd = allow_prog[3]; + if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "fail_prog_replace_no_ent", "unexpected success\n")) + goto err; + CHECK_FAIL(errno != ENOENT); + + /* replace 1st from the top program */ + attach_opts.replace_prog_fd = allow_prog[0]; + if (CHECK(bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "prog_replace", "errno=%d\n", errno)) + goto err; + + value = 0; + CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); + CHECK_FAIL(system(PING_CMD)); + CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); + CHECK_FAIL(value != 64 + 2 + 8 + 16); + /* detach 3rd from bottom program and ping again */ if (CHECK(!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS), "fail_prog_detach_from_cg3", "unexpected success\n")) @@ -202,7 +249,7 @@ void test_cgroup_attach_multi(void) CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); CHECK_FAIL(system(PING_CMD)); CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); - CHECK_FAIL(value != 1 + 2 + 16); + CHECK_FAIL(value != 64 + 2 + 16); /* detach 2nd from bottom program and ping again */ if (CHECK(bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS), @@ -213,7 +260,7 @@ void test_cgroup_attach_multi(void) CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); CHECK_FAIL(system(PING_CMD)); CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); - CHECK_FAIL(value != 1 + 2 + 4); + CHECK_FAIL(value != 64 + 2 + 4); prog_cnt = 4; CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, From 484b165306e18fd9a0d960e6ed2d6eff31665522 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:20 +0100 Subject: [PATCH 105/127] xsk: Eliminate the lazy update threshold The lazy update threshold was introduced to keep the producer and consumer some distance apart in the completion ring. This was important in the beginning of the development of AF_XDP as the ring format as that point in time was very sensitive to the producer and consumer being on the same cache line. This is not the case anymore as the current ring format does not degrade in any noticeable way when this happens. Moreover, this threshold makes it impossible to run the system with rings that have less than 128 entries. So let us remove this threshold and just get one entry from the ring as in all other functions. This will enable us to remove this function in a later commit. Note that xskq_produce_addr_lazy followed by xskq_produce_flush_addr_n are still not the same function as xskq_produce_addr() as it operates on another cached pointer. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-2-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index eddae4688862..a2f0ba65559f 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -11,7 +11,6 @@ #include #define RX_BATCH_SIZE 16 -#define LAZY_UPDATE_THRESHOLD 128 struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; @@ -239,7 +238,7 @@ static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) + if (xskq_nb_free(q, q->prod_head, 1) == 0) return -ENOSPC; /* A, matches D */ From 11cc2d21499cabe7e7964389634ed1de3ee91d33 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:21 +0100 Subject: [PATCH 106/127] xsk: Simplify detection of empty and full rings In order to set the correct return flags for poll, the xsk code has to check if the Rx queue is empty and if the Tx queue is full. This code was unnecessarily large and complex as it used the functions that are used to update the local state from the global state (xskq_nb_free and xskq_nb_avail). Since we are not doing this nor updating any data dependent on this state, we can simplify the functions. Another benefit from this is that we can also simplify the xskq_nb_free and xskq_nb_avail functions in a later commit. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-3-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index a2f0ba65559f..b28f9da7b317 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -362,12 +362,15 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) static inline bool xskq_full_desc(struct xsk_queue *q) { - return xskq_nb_avail(q, q->nentries) == q->nentries; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == + q->nentries; } static inline bool xskq_empty_desc(struct xsk_queue *q) { - return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); } void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); From d7012f05e3ca5aba92770c370895092d4882e8c2 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:22 +0100 Subject: [PATCH 107/127] xsk: Consolidate to one single cached producer pointer Currently, the xsk ring code has two cached producer pointers: prod_head and prod_tail. This patch consolidates these two into a single one called cached_prod to make the code simpler and easier to maintain. This will be in line with the user space part of the the code found in libbpf, that only uses a single cached pointer. The Rx path only uses the two top level functions xskq_produce_batch_desc and xskq_produce_flush_desc and they both use prod_head and never prod_tail. So just move them over to cached_prod. The Tx XDP_DRV path uses xskq_produce_addr_lazy and xskq_produce_flush_addr_n and unnecessarily operates on both prod_tail and prod_head, so move them over to just use cached_prod by skipping the intermediate step of updating prod_tail. The Tx path in XDP_SKB mode uses xskq_reserve_addr and xskq_produce_addr. They currently use both cached pointers, but we can operate on the global producer pointer in xskq_produce_addr since it has to be updated anyway, thus eliminating the use of both cached pointers. We can also remove the xskq_nb_free in xskq_produce_addr since it is already called in xskq_reserve_addr. No need to do it twice. When there is only one cached producer pointer, we can also simplify xskq_nb_free by removing one argument. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-4-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 47 ++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index b28f9da7b317..7ad80748f209 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -35,8 +35,7 @@ struct xsk_queue { u64 size; u32 ring_mask; u32 nentries; - u32 prod_head; - u32 prod_tail; + u32 cached_prod; u32 cons_head; u32 cons_tail; struct xdp_ring *ring; @@ -94,39 +93,39 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) { - u32 entries = q->prod_tail - q->cons_tail; + u32 entries = q->cached_prod - q->cons_tail; if (entries == 0) { /* Refresh the local pointer */ - q->prod_tail = READ_ONCE(q->ring->producer); - entries = q->prod_tail - q->cons_tail; + q->cached_prod = READ_ONCE(q->ring->producer); + entries = q->cached_prod - q->cons_tail; } return (entries > dcnt) ? dcnt : entries; } -static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) +static inline u32 xskq_nb_free(struct xsk_queue *q, u32 dcnt) { - u32 free_entries = q->nentries - (producer - q->cons_tail); + u32 free_entries = q->nentries - (q->cached_prod - q->cons_tail); if (free_entries >= dcnt) return free_entries; /* Refresh the local tail pointer */ q->cons_tail = READ_ONCE(q->ring->consumer); - return q->nentries - (producer - q->cons_tail); + return q->nentries - (q->cached_prod - q->cons_tail); } static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) { - u32 entries = q->prod_tail - q->cons_tail; + u32 entries = q->cached_prod - q->cons_tail; if (entries >= cnt) return true; /* Refresh the local pointer. */ - q->prod_tail = READ_ONCE(q->ring->producer); - entries = q->prod_tail - q->cons_tail; + q->cached_prod = READ_ONCE(q->ring->producer); + entries = q->cached_prod - q->cons_tail; return entries >= cnt; } @@ -220,17 +219,15 @@ static inline void xskq_discard_addr(struct xsk_queue *q) static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - if (xskq_nb_free(q, q->prod_tail, 1) == 0) - return -ENOSPC; + unsigned int idx = q->ring->producer; /* A, matches D */ - ring->desc[q->prod_tail++ & q->ring_mask] = addr; + ring->desc[idx++ & q->ring_mask] = addr; /* Order producer and data */ smp_wmb(); /* B, matches C */ - WRITE_ONCE(q->ring->producer, q->prod_tail); + WRITE_ONCE(q->ring->producer, idx); return 0; } @@ -238,11 +235,11 @@ static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (xskq_nb_free(q, q->prod_head, 1) == 0) + if (xskq_nb_free(q, 1) == 0) return -ENOSPC; /* A, matches D */ - ring->desc[q->prod_head++ & q->ring_mask] = addr; + ring->desc[q->cached_prod++ & q->ring_mask] = addr; return 0; } @@ -252,17 +249,16 @@ static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, /* Order producer and data */ smp_wmb(); /* B, matches C */ - q->prod_tail += nb_entries; - WRITE_ONCE(q->ring->producer, q->prod_tail); + WRITE_ONCE(q->ring->producer, q->ring->producer + nb_entries); } static inline int xskq_reserve_addr(struct xsk_queue *q) { - if (xskq_nb_free(q, q->prod_head, 1) == 0) + if (xskq_nb_free(q, 1) == 0) return -ENOSPC; /* A, matches D */ - q->prod_head++; + q->cached_prod++; return 0; } @@ -340,11 +336,11 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q, struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; unsigned int idx; - if (xskq_nb_free(q, q->prod_head, 1) == 0) + if (xskq_nb_free(q, 1) == 0) return -ENOSPC; /* A, matches D */ - idx = (q->prod_head++) & q->ring_mask; + idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; @@ -356,8 +352,7 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) /* Order producer and data */ smp_wmb(); /* B, matches C */ - q->prod_tail = q->prod_head; - WRITE_ONCE(q->ring->producer, q->prod_tail); + WRITE_ONCE(q->ring->producer, q->cached_prod); } static inline bool xskq_full_desc(struct xsk_queue *q) From 59e35e552529b858f35b30bc5a803ea532ca17f1 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:23 +0100 Subject: [PATCH 108/127] xsk: Standardize naming of producer ring access functions Adopt the naming of the producer ring access functions to have a similar naming convention as the functions in libbpf, but adapted to the kernel. You first reserve a number of entries that you later submit to the global state of the ring. This is much clearer, IMO, than the one that was in the kernel part. Once renamed, we also discover that two functions are actually the same, so remove one of them. Some of the primitive ring submission operations are also the same so break these out into __xskq_prod_submit that the upper level ring access functions can use. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-5-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 20 ++++++++-------- net/xdp/xsk_queue.h | 58 +++++++++++++++++++++------------------------ 2 files changed, 37 insertions(+), 41 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index e45c27f5cfca..5f2123fe630c 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -167,7 +167,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) offset += metalen; addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_produce_batch_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len); if (!err) { xskq_discard_addr(xs->umem->fq); xdp_return_buff(xdp); @@ -180,7 +180,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); + int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); if (err) xs->rx_dropped++; @@ -216,7 +216,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_flush(struct xdp_sock *xs) { - xskq_produce_flush_desc(xs->rx); + xskq_prod_submit(xs->rx); xs->sk.sk_data_ready(&xs->sk); } @@ -247,12 +247,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) memcpy(buffer, xdp->data_meta, len + metalen); addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_produce_batch_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) goto out_drop; xskq_discard_addr(xs->umem->fq); - xskq_produce_flush_desc(xs->rx); + xskq_prod_submit(xs->rx); spin_unlock_bh(&xs->rx_lock); @@ -294,7 +294,7 @@ void __xsk_map_flush(void) void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { - xskq_produce_flush_addr_n(umem->cq, nb_entries); + xskq_prod_submit_n(umem->cq, nb_entries); } EXPORT_SYMBOL(xsk_umem_complete_tx); @@ -319,7 +319,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) if (!xskq_peek_desc(xs->tx, desc, umem)) continue; - if (xskq_produce_addr_lazy(umem->cq, desc->addr)) + if (xskq_prod_reserve_addr(umem->cq, desc->addr)) goto out; xskq_discard_desc(xs->tx); @@ -348,7 +348,7 @@ static void xsk_destruct_skb(struct sk_buff *skb) unsigned long flags; spin_lock_irqsave(&xs->tx_completion_lock, flags); - WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); + xskq_prod_submit_addr(xs->umem->cq, addr); spin_unlock_irqrestore(&xs->tx_completion_lock, flags); sock_wfree(skb); @@ -389,7 +389,7 @@ static int xsk_generic_xmit(struct sock *sk) addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) { + if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) { kfree_skb(skb); goto out; } @@ -470,7 +470,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, __xsk_sendmsg(sk); } - if (xs->rx && !xskq_empty_desc(xs->rx)) + if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; if (xs->tx && !xskq_full_desc(xs->tx)) mask |= EPOLLOUT | EPOLLWRNORM; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 7ad80748f209..1b9a350f2e66 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -216,22 +216,17 @@ static inline void xskq_discard_addr(struct xsk_queue *q) q->cons_tail++; } -static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) +static inline int xskq_prod_reserve(struct xsk_queue *q) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - unsigned int idx = q->ring->producer; + if (xskq_nb_free(q, 1) == 0) + return -ENOSPC; /* A, matches D */ - ring->desc[idx++ & q->ring_mask] = addr; - - /* Order producer and data */ - smp_wmb(); /* B, matches C */ - - WRITE_ONCE(q->ring->producer, idx); + q->cached_prod++; return 0; } -static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) +static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; @@ -243,23 +238,32 @@ static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) return 0; } -static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, - u32 nb_entries) +static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { /* Order producer and data */ smp_wmb(); /* B, matches C */ - WRITE_ONCE(q->ring->producer, q->ring->producer + nb_entries); + WRITE_ONCE(q->ring->producer, idx); } -static inline int xskq_reserve_addr(struct xsk_queue *q) +static inline void xskq_prod_submit(struct xsk_queue *q) { - if (xskq_nb_free(q, 1) == 0) - return -ENOSPC; + __xskq_prod_submit(q, q->cached_prod); +} - /* A, matches D */ - q->cached_prod++; - return 0; +static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = q->ring->producer; + + ring->desc[idx++ & q->ring_mask] = addr; + + __xskq_prod_submit(q, idx); +} + +static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) +{ + __xskq_prod_submit(q, q->ring->producer + nb_entries); } /* Rx/Tx queue */ @@ -330,11 +334,11 @@ static inline void xskq_discard_desc(struct xsk_queue *q) q->cons_tail++; } -static inline int xskq_produce_batch_desc(struct xsk_queue *q, - u64 addr, u32 len) +static inline int xskq_prod_reserve_desc(struct xsk_queue *q, + u64 addr, u32 len) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; - unsigned int idx; + u32 idx; if (xskq_nb_free(q, 1) == 0) return -ENOSPC; @@ -347,14 +351,6 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q, return 0; } -static inline void xskq_produce_flush_desc(struct xsk_queue *q) -{ - /* Order producer and data */ - smp_wmb(); /* B, matches C */ - - WRITE_ONCE(q->ring->producer, q->cached_prod); -} - static inline bool xskq_full_desc(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -362,7 +358,7 @@ static inline bool xskq_full_desc(struct xsk_queue *q) q->nentries; } -static inline bool xskq_empty_desc(struct xsk_queue *q) +static inline bool xskq_prod_is_empty(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); From 4b638f13bab4dbfe8569c84e374e8201a427115c Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:24 +0100 Subject: [PATCH 109/127] xsk: Eliminate the RX batch size In the xsk consumer ring code there is a variable called RX_BATCH_SIZE that dictates the minimum number of entries that we try to grab from the fill and Tx rings. In fact, the code always try to grab the maximum amount of entries from these rings. The only thing this variable does is to throw an error if there is less than 16 (as it is defined) entries on the ring. There is no reason to do this and it will just lead to weird behavior from user space's point of view. So eliminate this variable. With this change, we will be able to simplify the xskq_nb_free and xskq_nb_avail code in the next commit. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-6-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 1b9a350f2e66..6ca5fed87852 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -10,8 +10,6 @@ #include #include -#define RX_BATCH_SIZE 16 - struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; @@ -202,7 +200,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr, if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + q->cons_head = q->cons_tail + xskq_nb_avail(q, 1); /* Order consumer and data */ smp_rmb(); @@ -320,7 +318,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + q->cons_head = q->cons_tail + xskq_nb_avail(q, 1); /* Order consumer and data */ smp_rmb(); /* C, matches B */ From df0ae6f78a45e5696427779fc3379c5d75f5d4a5 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:25 +0100 Subject: [PATCH 110/127] xsk: Simplify xskq_nb_avail and xskq_nb_free At this point, there are no users of the functions xskq_nb_avail and xskq_nb_free that take any other number of entries argument than 1, so let us get rid of the second argument that takes the number of entries. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-7-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 6ca5fed87852..8bfa2ee6864c 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -89,7 +89,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) +static inline u32 xskq_nb_avail(struct xsk_queue *q) { u32 entries = q->cached_prod - q->cons_tail; @@ -99,19 +99,21 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) entries = q->cached_prod - q->cons_tail; } - return (entries > dcnt) ? dcnt : entries; + return entries; } -static inline u32 xskq_nb_free(struct xsk_queue *q, u32 dcnt) +static inline bool xskq_prod_is_full(struct xsk_queue *q) { u32 free_entries = q->nentries - (q->cached_prod - q->cons_tail); - if (free_entries >= dcnt) - return free_entries; + if (free_entries) + return false; /* Refresh the local tail pointer */ q->cons_tail = READ_ONCE(q->ring->consumer); - return q->nentries - (q->cached_prod - q->cons_tail); + free_entries = q->nentries - (q->cached_prod - q->cons_tail); + + return !free_entries; } static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) @@ -200,7 +202,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr, if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, 1); + q->cons_head = q->cons_tail + xskq_nb_avail(q); /* Order consumer and data */ smp_rmb(); @@ -216,7 +218,7 @@ static inline void xskq_discard_addr(struct xsk_queue *q) static inline int xskq_prod_reserve(struct xsk_queue *q) { - if (xskq_nb_free(q, 1) == 0) + if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ @@ -228,7 +230,7 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (xskq_nb_free(q, 1) == 0) + if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ @@ -318,7 +320,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, 1); + q->cons_head = q->cons_tail + xskq_nb_avail(q); /* Order consumer and data */ smp_rmb(); /* C, matches B */ @@ -338,7 +340,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; - if (xskq_nb_free(q, 1) == 0) + if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ From c5ed924b54c892ee637d2e6889ef83341835a560 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:26 +0100 Subject: [PATCH 111/127] xsk: Simplify the consumer ring access functions Simplify and refactor consumer ring functions. The consumer first "peeks" to find descriptors or addresses that are available to read from the ring, then reads them and finally "releases" these descriptors once it is done. The two local variables cons_tail and cons_head are turned into one single variable called cached_cons. cached_tail referred to the cached value of the global consumer pointer and will be stored in cached_cons. For cached_head, we just use cached_prod instead as it was not used for a consumer queue before. It also better reflects what it really is now: a cached copy of the producer pointer. The names of the functions are also renamed in the same manner as the producer functions. The new functions are called xskq_cons_ followed by what it does. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-8-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 24 +++++------ net/xdp/xsk_queue.h | 102 ++++++++++++++++++++------------------------ 2 files changed, 58 insertions(+), 68 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5f2123fe630c..cd84b9d0f1e1 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,19 +41,19 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) { - return xskq_has_addrs(umem->fq, cnt); + return xskq_cons_has_entries(umem->fq, cnt); } EXPORT_SYMBOL(xsk_umem_has_addrs); u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { - return xskq_peek_addr(umem->fq, addr, umem); + return xskq_cons_peek_addr(umem->fq, addr, umem); } EXPORT_SYMBOL(xsk_umem_peek_addr); void xsk_umem_discard_addr(struct xdp_umem *umem) { - xskq_discard_addr(umem->fq); + xskq_cons_release(umem->fq); } EXPORT_SYMBOL(xsk_umem_discard_addr); @@ -148,7 +148,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) u32 metalen; int err; - if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || + if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; @@ -169,7 +169,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) addr = xsk_umem_adjust_offset(xs->umem, addr, offset); err = xskq_prod_reserve_desc(xs->rx, addr, len); if (!err) { - xskq_discard_addr(xs->umem->fq); + xskq_cons_release(xs->umem->fq); xdp_return_buff(xdp); return 0; } @@ -236,7 +236,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) goto out_unlock; } - if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || + if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { err = -ENOSPC; goto out_drop; @@ -251,7 +251,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) if (err) goto out_drop; - xskq_discard_addr(xs->umem->fq); + xskq_cons_release(xs->umem->fq); xskq_prod_submit(xs->rx); spin_unlock_bh(&xs->rx_lock); @@ -316,13 +316,13 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_list, list) { - if (!xskq_peek_desc(xs->tx, desc, umem)) + if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; if (xskq_prod_reserve_addr(umem->cq, desc->addr)) goto out; - xskq_discard_desc(xs->tx); + xskq_cons_release(xs->tx); rcu_read_unlock(); return true; } @@ -368,7 +368,7 @@ static int xsk_generic_xmit(struct sock *sk) if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - while (xskq_peek_desc(xs->tx, &desc, xs->umem)) { + while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) { char *buffer; u64 addr; u32 len; @@ -401,7 +401,7 @@ static int xsk_generic_xmit(struct sock *sk) skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); - xskq_discard_desc(xs->tx); + xskq_cons_release(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { /* SKB completed but not sent */ @@ -472,7 +472,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; - if (xs->tx && !xskq_full_desc(xs->tx)) + if (xs->tx && !xskq_cons_is_full(xs->tx)) mask |= EPOLLOUT | EPOLLWRNORM; return mask; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 8bfa2ee6864c..1436116767ea 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -34,8 +34,7 @@ struct xsk_queue { u32 ring_mask; u32 nentries; u32 cached_prod; - u32 cons_head; - u32 cons_tail; + u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; }; @@ -89,43 +88,48 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -static inline u32 xskq_nb_avail(struct xsk_queue *q) +static inline void __xskq_cons_release(struct xsk_queue *q) { - u32 entries = q->cached_prod - q->cons_tail; + smp_mb(); /* D, matches A */ + WRITE_ONCE(q->ring->consumer, q->cached_cons); +} - if (entries == 0) { - /* Refresh the local pointer */ - q->cached_prod = READ_ONCE(q->ring->producer); - entries = q->cached_prod - q->cons_tail; - } +static inline void __xskq_cons_peek(struct xsk_queue *q) +{ + /* Refresh the local pointer */ + q->cached_prod = READ_ONCE(q->ring->producer); + smp_rmb(); /* C, matches B */ +} - return entries; +static inline void xskq_cons_get_entries(struct xsk_queue *q) +{ + __xskq_cons_release(q); + __xskq_cons_peek(q); } static inline bool xskq_prod_is_full(struct xsk_queue *q) { - u32 free_entries = q->nentries - (q->cached_prod - q->cons_tail); + u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); if (free_entries) return false; /* Refresh the local tail pointer */ - q->cons_tail = READ_ONCE(q->ring->consumer); - free_entries = q->nentries - (q->cached_prod - q->cons_tail); + q->cached_cons = READ_ONCE(q->ring->consumer); + free_entries = q->nentries - (q->cached_prod - q->cached_cons); return !free_entries; } -static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) { - u32 entries = q->cached_prod - q->cons_tail; + u32 entries = q->cached_prod - q->cached_cons; if (entries >= cnt) return true; - /* Refresh the local pointer. */ - q->cached_prod = READ_ONCE(q->ring->producer); - entries = q->cached_prod - q->cons_tail; + __xskq_cons_peek(q); + entries = q->cached_prod - q->cached_cons; return entries >= cnt; } @@ -172,9 +176,10 @@ static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, struct xdp_umem *umem) { - while (q->cons_tail != q->cons_head) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - unsigned int idx = q->cons_tail & q->ring_mask; + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + while (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; @@ -190,30 +195,27 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, return addr; out: - q->cons_tail++; + q->cached_cons++; } return NULL; } -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline u64 *xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { - if (q->cons_tail == q->cons_head) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q); - - /* Order consumer and data */ - smp_rmb(); - } - + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); return xskq_validate_addr(q, addr, umem); } -static inline void xskq_discard_addr(struct xsk_queue *q) +static inline void xskq_cons_release(struct xsk_queue *q) { - q->cons_tail++; + /* To improve performance, only update local state here. + * Do the actual release operation when we get new entries + * from the ring in xskq_cons_get_entries() instead. + */ + q->cached_cons++; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -299,41 +301,29 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xdp_umem *umem) { - while (q->cons_tail != q->cons_head) { + while (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; - unsigned int idx = q->cons_tail & q->ring_mask; + u32 idx = q->cached_cons & q->ring_mask; *desc = READ_ONCE(ring->desc[idx]); if (xskq_is_valid_desc(q, desc, umem)) return desc; - q->cons_tail++; + q->cached_cons++; } return NULL; } -static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, - struct xdp_desc *desc, - struct xdp_umem *umem) +static inline struct xdp_desc *xskq_cons_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) { - if (q->cons_tail == q->cons_head) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q); - - /* Order consumer and data */ - smp_rmb(); /* C, matches B */ - } - + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); return xskq_validate_desc(q, desc, umem); } -static inline void xskq_discard_desc(struct xsk_queue *q) -{ - q->cons_tail++; -} - static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) { @@ -351,7 +341,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, return 0; } -static inline bool xskq_full_desc(struct xsk_queue *q) +static inline bool xskq_cons_is_full(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == From 03896ef1f0cb23d2742ddf486c531c700a2da7d6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:27 +0100 Subject: [PATCH 112/127] xsk: Change names of validation functions Change the names of the validation functions to better reflect what they are doing. The uppermost ones are reading entries from the rings and only the bottom ones validate entries. So xskq_cons_read_ is a better prefix name. Also change the xskq_cons_read_ functions to return a bool as the the descriptor or address is already returned by reference in the parameters. Everyone is using the return value as a bool anyway. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-9-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 4 +-- net/xdp/xsk.c | 4 +-- net/xdp/xsk_queue.h | 59 ++++++++++++++++++++++-------------------- 3 files changed, 35 insertions(+), 32 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 48594740d67c..63f005830866 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -118,7 +118,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); void xsk_umem_discard_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); @@ -197,7 +197,7 @@ static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) return xsk_umem_has_addrs(umem, cnt - rq->length); } -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) { struct xdp_umem_fq_reuse *rq = umem->fq_reuse; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cd84b9d0f1e1..4e932a930b3a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -45,7 +45,7 @@ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) } EXPORT_SYMBOL(xsk_umem_has_addrs); -u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { return xskq_cons_peek_addr(umem->fq, addr, umem); } @@ -126,7 +126,7 @@ static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, void *to_buf = xdp_umem_get_data(umem, addr); addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) { + if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; u64 page_start = addr & ~(PAGE_SIZE - 1); u64 first_len = PAGE_SIZE - (addr - page_start); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 1436116767ea..6d04a96dd092 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -136,8 +136,9 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) /* UMEM queue */ -static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr, - u64 length) +static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, + u64 addr, + u64 length) { bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; bool next_pg_contig = @@ -147,7 +148,7 @@ static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr, return cross_pg && !next_pg_contig; } -static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) +static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) { if (addr >= q->size) { q->invalid_descs++; @@ -157,7 +158,8 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) return true; } -static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, +static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, + u64 addr, u64 length, struct xdp_umem *umem) { @@ -165,7 +167,7 @@ static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, addr = xsk_umem_add_offset_to_addr(addr); if (base_addr >= q->size || addr >= q->size || - xskq_crosses_non_contig_pg(umem, addr, length)) { + xskq_cons_crosses_non_contig_pg(umem, addr, length)) { q->invalid_descs++; return false; } @@ -173,8 +175,8 @@ static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, return true; } -static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; @@ -184,29 +186,29 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_is_valid_addr_unaligned(q, *addr, + if (xskq_cons_is_valid_unaligned(q, *addr, umem->chunk_size_nohr, umem)) - return addr; + return true; goto out; } - if (xskq_is_valid_addr(q, *addr)) - return addr; + if (xskq_cons_is_valid_addr(q, *addr)) + return true; out: q->cached_cons++; } - return NULL; + return false; } -static inline u64 *xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, +static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, struct xdp_umem *umem) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); - return xskq_validate_addr(q, addr, umem); + return xskq_cons_read_addr(q, addr, umem); } static inline void xskq_cons_release(struct xsk_queue *q) @@ -270,11 +272,12 @@ static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) /* Rx/Tx queue */ -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, - struct xdp_umem *umem) +static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, + struct xdp_desc *d, + struct xdp_umem *umem) { if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem)) + if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) return false; if (d->len > umem->chunk_size_nohr || d->options) { @@ -285,7 +288,7 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, return true; } - if (!xskq_is_valid_addr(q, d->addr)) + if (!xskq_cons_is_valid_addr(q, d->addr)) return false; if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || @@ -297,31 +300,31 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, return true; } -static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, - struct xdp_desc *desc, - struct xdp_umem *umem) +static inline bool xskq_cons_read_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) { while (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = READ_ONCE(ring->desc[idx]); - if (xskq_is_valid_desc(q, desc, umem)) - return desc; + if (xskq_cons_is_valid_desc(q, desc, umem)) + return true; q->cached_cons++; } - return NULL; + return false; } -static inline struct xdp_desc *xskq_cons_peek_desc(struct xsk_queue *q, - struct xdp_desc *desc, - struct xdp_umem *umem) +static inline bool xskq_cons_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); - return xskq_validate_desc(q, desc, umem); + return xskq_cons_read_desc(q, desc, umem); } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, From f8509aa078de0842ec1817e8026e58620cd05d3b Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:28 +0100 Subject: [PATCH 113/127] xsk: ixgbe: i40e: ice: mlx5: Xsk_umem_discard_addr to xsk_umem_release_addr Change the name of xsk_umem_discard_addr to xsk_umem_release_addr to better reflect the new naming of the AF_XDP queue manipulation functions. As this functions is used by drivers implementing support for AF_XDP zero-copy, it requires a name change to these drivers. The function xsk_umem_release_addr_rq has also changed name in the same fashion. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-10-git-send-email-magnus.karlsson@intel.com --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_xsk.c | 4 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c | 2 +- include/net/xdp_sock.h | 10 +++++----- net/xdp/xsk.c | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index d07e1a890428..20d6f11a3468 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -269,7 +269,7 @@ static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring, bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - xsk_umem_discard_addr(umem); + xsk_umem_release_addr(umem); return true; } @@ -306,7 +306,7 @@ static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring, bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - xsk_umem_discard_addr_rq(umem); + xsk_umem_release_addr_rq(umem); return true; } diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index cf9b8b22d24f..0af1f0b951c0 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -555,7 +555,7 @@ ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) rx_buf->handle = handle + umem->headroom; - xsk_umem_discard_addr(umem); + xsk_umem_release_addr(umem); return true; } @@ -591,7 +591,7 @@ ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) rx_buf->handle = handle + umem->headroom; - xsk_umem_discard_addr_rq(umem); + xsk_umem_release_addr_rq(umem); return true; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index d6feaacfbf89..1501d0a22078 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -277,7 +277,7 @@ static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring, bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - xsk_umem_discard_addr(umem); + xsk_umem_release_addr(umem); return true; } @@ -304,7 +304,7 @@ static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring, bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - xsk_umem_discard_addr_rq(umem); + xsk_umem_release_addr_rq(umem); return true; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c index 475b6bd5d29b..62fc8a128a8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c @@ -35,7 +35,7 @@ int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq, */ dma_info->addr = xdp_umem_get_dma(umem, handle); - xsk_umem_discard_addr_rq(umem); + xsk_umem_release_addr_rq(umem); dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE, DMA_BIDIRECTIONAL); diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 63f005830866..e86ec48ef627 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -119,7 +119,7 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_discard_addr(struct xdp_umem *umem); +void xsk_umem_release_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); void xsk_umem_consume_tx_done(struct xdp_umem *umem); @@ -208,12 +208,12 @@ static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) return addr; } -static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem) +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) { struct xdp_umem_fq_reuse *rq = umem->fq_reuse; if (!rq->length) - xsk_umem_discard_addr(umem); + xsk_umem_release_addr(umem); else rq->length--; } @@ -258,7 +258,7 @@ static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) return NULL; } -static inline void xsk_umem_discard_addr(struct xdp_umem *umem) +static inline void xsk_umem_release_addr(struct xdp_umem *umem) { } @@ -332,7 +332,7 @@ static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) return NULL; } -static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem) +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) { } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4e932a930b3a..55092feeaf36 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -51,11 +51,11 @@ bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) } EXPORT_SYMBOL(xsk_umem_peek_addr); -void xsk_umem_discard_addr(struct xdp_umem *umem) +void xsk_umem_release_addr(struct xdp_umem *umem) { xskq_cons_release(umem->fq); } -EXPORT_SYMBOL(xsk_umem_discard_addr); +EXPORT_SYMBOL(xsk_umem_release_addr); void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { From c34787fcc90f0732ff00754057f11780007002e4 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:29 +0100 Subject: [PATCH 114/127] xsk: Remove unnecessary READ_ONCE of data There are two unnecessary READ_ONCE of descriptor data. These are not needed since the data is written by the producer before it signals that the data is available by incrementing the producer pointer. As the access to this producer pointer is serialized and the consumer always reads the descriptor after it has read and synchronized with the producer counter, the write of the descriptor will have fully completed and it does not matter if the consumer has any read tearing. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-11-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 6d04a96dd092..4c049cacc693 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -183,7 +183,7 @@ static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, while (q->cached_cons != q->cached_prod) { u32 idx = q->cached_cons & q->ring_mask; - *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; + *addr = ring->desc[idx] & q->chunk_mask; if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { if (xskq_cons_is_valid_unaligned(q, *addr, @@ -308,7 +308,7 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; - *desc = READ_ONCE(ring->desc[idx]); + *desc = ring->desc[idx]; if (xskq_cons_is_valid_desc(q, desc, umem)) return true; From 15d8c9162ced1789d25261859a7b37db8426e409 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:30 +0100 Subject: [PATCH 115/127] xsk: Add function naming comments and reorder functions Add comments on how the ring access functions are named and how they are supposed to be used for producers and consumers. The functions are also reordered so that the consumer functions are in the beginning and the producer functions in the end, for easier reference. Put this in a separate patch as the diff might look a little odd, but no functionality has changed in this patch. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-12-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 10 ++ net/xdp/xsk_queue.h | 293 ++++++++++++++++++++++++-------------------- 2 files changed, 167 insertions(+), 136 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 55092feeaf36..5560d60c1ff5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -319,6 +319,11 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; + /* This is the backpreassure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. This avoids having to implement + * any buffering in the Tx path. + */ if (xskq_prod_reserve_addr(umem->cq, desc->addr)) goto out; @@ -389,6 +394,11 @@ static int xsk_generic_xmit(struct sock *sk) addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); + /* This is the backpreassure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. This avoids having to implement + * any buffering in the Tx path. + */ if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) { kfree_skb(skb); goto out; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 4c049cacc693..bec2af11853a 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -81,60 +81,27 @@ struct xsk_queue { * now and again after circling through the ring. */ -/* Common functions operating for both RXTX and umem queues */ +/* The operations on the rings are the following: + * + * producer consumer + * + * RESERVE entries PEEK in the ring for entries + * WRITE data into the ring READ data from the ring + * SUBMIT entries RELEASE entries + * + * The producer reserves one or more entries in the ring. It can then + * fill in these entries and finally submit them so that they can be + * seen and read by the consumer. + * + * The consumer peeks into the ring to see if the producer has written + * any new entries. If so, the producer can then read these entries + * and when it is done reading them release them back to the producer + * so that the producer can use these slots to fill in new entries. + * + * The function names below reflect these operations. + */ -static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) -{ - return q ? q->invalid_descs : 0; -} - -static inline void __xskq_cons_release(struct xsk_queue *q) -{ - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cached_cons); -} - -static inline void __xskq_cons_peek(struct xsk_queue *q) -{ - /* Refresh the local pointer */ - q->cached_prod = READ_ONCE(q->ring->producer); - smp_rmb(); /* C, matches B */ -} - -static inline void xskq_cons_get_entries(struct xsk_queue *q) -{ - __xskq_cons_release(q); - __xskq_cons_peek(q); -} - -static inline bool xskq_prod_is_full(struct xsk_queue *q) -{ - u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); - - if (free_entries) - return false; - - /* Refresh the local tail pointer */ - q->cached_cons = READ_ONCE(q->ring->consumer); - free_entries = q->nentries - (q->cached_prod - q->cached_cons); - - return !free_entries; -} - -static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) -{ - u32 entries = q->cached_prod - q->cached_cons; - - if (entries >= cnt) - return true; - - __xskq_cons_peek(q); - entries = q->cached_prod - q->cached_cons; - - return entries >= cnt; -} - -/* UMEM queue */ +/* Functions that read and validate content from consumer rings. */ static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr, @@ -148,16 +115,6 @@ static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, return cross_pg && !next_pg_contig; } -static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) -{ - if (addr >= q->size) { - q->invalid_descs++; - return false; - } - - return true; -} - static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, u64 addr, u64 length, @@ -175,6 +132,16 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, return true; } +static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) +{ + if (addr >= q->size) { + q->invalid_descs++; + return false; + } + + return true; +} + static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, struct xdp_umem *umem) { @@ -203,75 +170,6 @@ out: return false; } -static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) -{ - if (q->cached_prod == q->cached_cons) - xskq_cons_get_entries(q); - return xskq_cons_read_addr(q, addr, umem); -} - -static inline void xskq_cons_release(struct xsk_queue *q) -{ - /* To improve performance, only update local state here. - * Do the actual release operation when we get new entries - * from the ring in xskq_cons_get_entries() instead. - */ - q->cached_cons++; -} - -static inline int xskq_prod_reserve(struct xsk_queue *q) -{ - if (xskq_prod_is_full(q)) - return -ENOSPC; - - /* A, matches D */ - q->cached_prod++; - return 0; -} - -static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - if (xskq_prod_is_full(q)) - return -ENOSPC; - - /* A, matches D */ - ring->desc[q->cached_prod++ & q->ring_mask] = addr; - return 0; -} - -static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) -{ - /* Order producer and data */ - smp_wmb(); /* B, matches C */ - - WRITE_ONCE(q->ring->producer, idx); -} - -static inline void xskq_prod_submit(struct xsk_queue *q) -{ - __xskq_prod_submit(q, q->cached_prod); -} - -static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - u32 idx = q->ring->producer; - - ring->desc[idx++ & q->ring_mask] = addr; - - __xskq_prod_submit(q, idx); -} - -static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) -{ - __xskq_prod_submit(q, q->ring->producer + nb_entries); -} - -/* Rx/Tx queue */ - static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xdp_umem *umem) @@ -318,6 +216,48 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, return false; } +/* Functions for consumers */ + +static inline void __xskq_cons_release(struct xsk_queue *q) +{ + smp_mb(); /* D, matches A */ + WRITE_ONCE(q->ring->consumer, q->cached_cons); +} + +static inline void __xskq_cons_peek(struct xsk_queue *q) +{ + /* Refresh the local pointer */ + q->cached_prod = READ_ONCE(q->ring->producer); + smp_rmb(); /* C, matches B */ +} + +static inline void xskq_cons_get_entries(struct xsk_queue *q) +{ + __xskq_cons_release(q); + __xskq_cons_peek(q); +} + +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +{ + u32 entries = q->cached_prod - q->cached_cons; + + if (entries >= cnt) + return true; + + __xskq_cons_peek(q); + entries = q->cached_prod - q->cached_cons; + + return entries >= cnt; +} + +static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr(q, addr, umem); +} + static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xdp_umem *umem) @@ -327,6 +267,60 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q, return xskq_cons_read_desc(q, desc, umem); } +static inline void xskq_cons_release(struct xsk_queue *q) +{ + /* To improve performance, only update local state here. + * Reflect this to global state when we get new entries + * from the ring in xskq_cons_get_entries(). + */ + q->cached_cons++; +} + +static inline bool xskq_cons_is_full(struct xsk_queue *q) +{ + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == + q->nentries; +} + +/* Functions for producers */ + +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + if (free_entries) + return false; + + /* Refresh the local tail pointer */ + q->cached_cons = READ_ONCE(q->ring->consumer); + free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + return !free_entries; +} + +static inline int xskq_prod_reserve(struct xsk_queue *q) +{ + if (xskq_prod_is_full(q)) + return -ENOSPC; + + /* A, matches D */ + q->cached_prod++; + return 0; +} + +static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + if (xskq_prod_is_full(q)) + return -ENOSPC; + + /* A, matches D */ + ring->desc[q->cached_prod++ & q->ring_mask] = addr; + return 0; +} + static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) { @@ -344,11 +338,31 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, return 0; } -static inline bool xskq_cons_is_full(struct xsk_queue *q) +static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { - /* No barriers needed since data is not accessed */ - return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == - q->nentries; + smp_wmb(); /* B, matches C */ + + WRITE_ONCE(q->ring->producer, idx); +} + +static inline void xskq_prod_submit(struct xsk_queue *q) +{ + __xskq_prod_submit(q, q->cached_prod); +} + +static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = q->ring->producer; + + ring->desc[idx++ & q->ring_mask] = addr; + + __xskq_prod_submit(q, idx); +} + +static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) +{ + __xskq_prod_submit(q, q->ring->producer + nb_entries); } static inline bool xskq_prod_is_empty(struct xsk_queue *q) @@ -357,6 +371,13 @@ static inline bool xskq_prod_is_empty(struct xsk_queue *q) return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); } +/* For both producers and consumers */ + +static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) +{ + return q ? q->invalid_descs : 0; +} + void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); From 1d9cb1f381860b529edec57cf7a08133f40366eb Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:31 +0100 Subject: [PATCH 116/127] xsk: Use struct_size() helper Improve readability and maintainability by using the struct_size() helper when allocating the AF_XDP rings. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-13-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index b66504592d9b..c90e9c1e3c63 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -18,14 +18,14 @@ void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) q->chunk_mask = chunk_mask; } -static u32 xskq_umem_get_ring_size(struct xsk_queue *q) +static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { - return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64); -} + struct xdp_umem_ring *umem_ring; + struct xdp_rxtx_ring *rxtx_ring; -static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) -{ - return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc); + if (umem_queue) + return struct_size(umem_ring, desc, q->nentries); + return struct_size(rxtx_ring, desc, q->nentries); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) @@ -43,8 +43,7 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | __GFP_NORETRY; - size = umem_queue ? xskq_umem_get_ring_size(q) : - xskq_rxtx_get_ring_size(q); + size = xskq_get_ring_size(q, umem_queue); q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, get_order(size)); From 478bee0df0ec9067c12e7d058d78721a7e7a1b29 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 19 Dec 2019 16:05:11 -0800 Subject: [PATCH 117/127] selftests/bpf: Preserve errno in test_progs CHECK macros It's follow-up for discussion [1] CHECK and CHECK_FAIL macros in test_progs.h can affect errno in some circumstances, e.g. if some code accidentally closes stdout. It makes checking errno in patterns like this unreliable: if (CHECK(!bpf_prog_attach_xattr(...), "tag", "msg")) goto err; CHECK_FAIL(errno != ENOENT); , since by CHECK_FAIL time errno could be affected not only by bpf_prog_attach_xattr but by CHECK as well. Fix it by saving and restoring errno in the macros. There is no "Fixes" tag since no problems were discovered yet and it's rather precaution. test_progs was run with this change and no difference was identified. [1] https://lore.kernel.org/bpf/20191219210907.GD16266@rdna-mbp.dhcp.thefacebook.com/ Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191220000511.1684853-1-rdna@fb.com --- tools/testing/selftests/bpf/test_progs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 8477df835979..de1fdaa4e7b4 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -100,6 +100,7 @@ extern struct ipv6_packet pkt_v6; #define _CHECK(condition, tag, duration, format...) ({ \ int __ret = !!(condition); \ + int __save_errno = errno; \ if (__ret) { \ test__fail(); \ printf("%s:FAIL:%s ", __func__, tag); \ @@ -108,15 +109,18 @@ extern struct ipv6_packet pkt_v6; printf("%s:PASS:%s %d nsec\n", \ __func__, tag, duration); \ } \ + errno = __save_errno; \ __ret; \ }) #define CHECK_FAIL(condition) ({ \ int __ret = !!(condition); \ + int __save_errno = errno; \ if (__ret) { \ test__fail(); \ printf("%s:FAIL:%d\n", __func__, __LINE__); \ } \ + errno = __save_errno; \ __ret; \ }) From d3f11b018f6ce278e683008e9c225fe87afc532d Mon Sep 17 00:00:00 2001 From: Jay Jayatheerthan Date: Fri, 20 Dec 2019 14:25:25 +0530 Subject: [PATCH 118/127] samples/bpf: xdpsock: Add duration option to specify how long to run The application now supports '-d' or '--duration' option to specify number of seconds to run. This is used in tx, rx and l2fwd features. If this option is not provided, the application runs forever. Signed-off-by: Jay Jayatheerthan Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191220085530.4980-2-jay.jayatheerthan@intel.com --- samples/bpf/xdpsock_user.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index e7829e5baaff..e188a79a9c31 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -65,6 +65,9 @@ static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static const char *opt_if = ""; static int opt_ifindex; static int opt_queue; +static unsigned long opt_duration; +static unsigned long start_time; +static bool benchmark_done; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -167,10 +170,21 @@ static void dump_stats(void) } } +static bool is_benchmark_done(void) +{ + if (opt_duration > 0) { + unsigned long dt = (get_nsecs() - start_time); + + if (dt >= opt_duration) + benchmark_done = true; + } + return benchmark_done; +} + static void *poller(void *arg) { (void)arg; - for (;;) { + while (!is_benchmark_done()) { sleep(opt_interval); dump_stats(); } @@ -375,6 +389,7 @@ static struct option long_options[] = { {"unaligned", no_argument, 0, 'u'}, {"shared-umem", no_argument, 0, 'M'}, {"force", no_argument, 0, 'F'}, + {"duration", required_argument, 0, 'd'}, {0, 0, 0, 0} }; @@ -399,6 +414,8 @@ static void usage(const char *prog) " -u, --unaligned Enable unaligned chunk placement\n" " -M, --shared-umem Enable XDP_SHARED_UMEM\n" " -F, --force Force loading the XDP prog\n" + " -d, --duration=n Duration in secs to run command.\n" + " Default: forever.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE); exit(EXIT_FAILURE); @@ -411,7 +428,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:muM", + c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:muMd:", long_options, &option_index); if (c == -1) break; @@ -469,6 +486,10 @@ static void parse_command_line(int argc, char **argv) case 'M': opt_num_xsks = MAX_SOCKS; break; + case 'd': + opt_duration = atoi(optarg); + opt_duration *= 1000000000; + break; default: usage(basename(argv[0])); } @@ -622,6 +643,9 @@ static void rx_drop_all(void) for (i = 0; i < num_socks; i++) rx_drop(xsks[i], fds); + + if (benchmark_done) + break; } } @@ -671,6 +695,9 @@ static void tx_only_all(void) for (i = 0; i < num_socks; i++) tx_only(xsks[i], frame_nb[i]); + + if (benchmark_done) + break; } } @@ -739,6 +766,9 @@ static void l2fwd_all(void) for (i = 0; i < num_socks; i++) l2fwd(xsks[i], fds); + + if (benchmark_done) + break; } } @@ -852,6 +882,7 @@ int main(int argc, char **argv) exit_with_error(ret); prev_time = get_nsecs(); + start_time = prev_time; if (opt_bench == BENCH_RXDROP) rx_drop_all(); @@ -860,5 +891,7 @@ int main(int argc, char **argv) else l2fwd_all(); + pthread_join(pt, NULL); + return 0; } From 695255882bdf63da240db33d0f2aa9ccca1cbe67 Mon Sep 17 00:00:00 2001 From: Jay Jayatheerthan Date: Fri, 20 Dec 2019 14:25:26 +0530 Subject: [PATCH 119/127] samples/bpf: xdpsock: Use common code to handle signal and main exit Add code to do cleanup for signals and application completion in a unified fashion. The signal handler sets benckmark_done flag terminating the threads. The cleanup is called before returning from main() function. Signed-off-by: Jay Jayatheerthan Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191220085530.4980-3-jay.jayatheerthan@intel.com --- samples/bpf/xdpsock_user.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index e188a79a9c31..7febc3d519a1 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -209,6 +209,11 @@ static void remove_xdp_program(void) } static void int_exit(int sig) +{ + benchmark_done = true; +} + +static void xdpsock_cleanup(void) { struct xsk_umem *umem = xsks[0]->umem->umem; int i; @@ -218,8 +223,6 @@ static void int_exit(int sig) xsk_socket__delete(xsks[i]->xsk); (void)xsk_umem__delete(umem); remove_xdp_program(); - - exit(EXIT_SUCCESS); } static void __exit_with_error(int error, const char *file, const char *func, @@ -893,5 +896,7 @@ int main(int argc, char **argv) pthread_join(pt, NULL); + xdpsock_cleanup(); + return 0; } From cd9e72b6f21044b36a096833003811c2b2038455 Mon Sep 17 00:00:00 2001 From: Jay Jayatheerthan Date: Fri, 20 Dec 2019 14:25:27 +0530 Subject: [PATCH 120/127] samples/bpf: xdpsock: Add option to specify batch size New option to specify batch size for tx, rx and l2fwd has been added. This allows fine tuning to maximize performance. It is specified using '-b' or '--batch_size' options. When not specified default is 64. Signed-off-by: Jay Jayatheerthan Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191220085530.4980-4-jay.jayatheerthan@intel.com --- samples/bpf/xdpsock_user.c | 54 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 7febc3d519a1..1ba3e7142f39 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -45,7 +45,6 @@ #endif #define NUM_FRAMES (4 * 1024) -#define BATCH_SIZE 64 #define DEBUG_HEXDUMP 0 @@ -68,6 +67,7 @@ static int opt_queue; static unsigned long opt_duration; static unsigned long start_time; static bool benchmark_done; +static u32 opt_batch_size = 64; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -237,7 +237,6 @@ static void __exit_with_error(int error, const char *file, const char *func, #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \ __LINE__) - static const char pkt_data[] = "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00" "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14" @@ -291,11 +290,10 @@ static void hex_dump(void *pkt, size_t length, u64 addr) printf("\n"); } -static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr) +static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) { memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, sizeof(pkt_data) - 1); - return sizeof(pkt_data) - 1; } static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) @@ -393,6 +391,7 @@ static struct option long_options[] = { {"shared-umem", no_argument, 0, 'M'}, {"force", no_argument, 0, 'F'}, {"duration", required_argument, 0, 'd'}, + {"batch-size", required_argument, 0, 'b'}, {0, 0, 0, 0} }; @@ -419,8 +418,11 @@ static void usage(const char *prog) " -F, --force Force loading the XDP prog\n" " -d, --duration=n Duration in secs to run command.\n" " Default: forever.\n" + " -b, --batch-size=n Batch size for sending or receiving\n" + " packets. Default: %d\n" "\n"; - fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE); + fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, + opt_batch_size); exit(EXIT_FAILURE); } @@ -431,7 +433,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:muMd:", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:", long_options, &option_index); if (c == -1) break; @@ -493,6 +495,9 @@ static void parse_command_line(int argc, char **argv) opt_duration = atoi(optarg); opt_duration *= 1000000000; break; + case 'b': + opt_batch_size = atoi(optarg); + break; default: usage(basename(argv[0])); } @@ -540,7 +545,7 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); - ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE : + ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : xsk->outstanding_tx; /* re-add completed Tx buffers */ @@ -580,7 +585,7 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk) if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); - rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx); + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, opt_batch_size, &idx); if (rcvd > 0) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; @@ -594,7 +599,7 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) u32 idx_rx = 0, idx_fq = 0; int ret; - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) ret = poll(fds, num_socks, opt_timeout); @@ -655,23 +660,24 @@ static void rx_drop_all(void) static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb) { u32 idx; + unsigned int i; - if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == BATCH_SIZE) { - unsigned int i; - - for (i = 0; i < BATCH_SIZE; i++) { - xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr = - (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; - xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len = - sizeof(pkt_data) - 1; - } - - xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE); - xsk->outstanding_tx += BATCH_SIZE; - frame_nb += BATCH_SIZE; - frame_nb %= NUM_FRAMES; + while (xsk_ring_prod__reserve(&xsk->tx, opt_batch_size, &idx) < + opt_batch_size) { + complete_tx_only(xsk); } + for (i = 0; i < opt_batch_size; i++) { + struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, + idx + i); + tx_desc->addr = (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; + tx_desc->len = sizeof(pkt_data) - 1; + } + + xsk_ring_prod__submit(&xsk->tx, opt_batch_size); + xsk->outstanding_tx += opt_batch_size; + frame_nb += opt_batch_size; + frame_nb %= NUM_FRAMES; complete_tx_only(xsk); } @@ -712,7 +718,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) complete_tx_l2fwd(xsk, fds); - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) ret = poll(fds, num_socks, opt_timeout); From ece6e9694751a4f0b99372724a0705a0217132b3 Mon Sep 17 00:00:00 2001 From: Jay Jayatheerthan Date: Fri, 20 Dec 2019 14:25:28 +0530 Subject: [PATCH 121/127] samples/bpf: xdpsock: Add option to specify number of packets to send Use '-C' or '--tx-pkt-count' to specify number of packets to send. If it is not specified, the application sends packets forever. If packet count is not a multiple of batch size, last batch sent is less than the batch size. Signed-off-by: Jay Jayatheerthan Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191220085530.4980-5-jay.jayatheerthan@intel.com --- samples/bpf/xdpsock_user.c | 73 ++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 1ba3e7142f39..f96ce3055d46 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -68,6 +68,7 @@ static unsigned long opt_duration; static unsigned long start_time; static bool benchmark_done; static u32 opt_batch_size = 64; +static int opt_pkt_count; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -392,6 +393,7 @@ static struct option long_options[] = { {"force", no_argument, 0, 'F'}, {"duration", required_argument, 0, 'd'}, {"batch-size", required_argument, 0, 'b'}, + {"tx-pkt-count", required_argument, 0, 'C'}, {0, 0, 0, 0} }; @@ -420,6 +422,8 @@ static void usage(const char *prog) " Default: forever.\n" " -b, --batch-size=n Batch size for sending or receiving\n" " packets. Default: %d\n" + " -C, --tx-pkt-count=n Number of packets to send.\n" + " Default: Continuous packets.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size); @@ -433,7 +437,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:", long_options, &option_index); if (c == -1) break; @@ -498,6 +502,9 @@ static void parse_command_line(int argc, char **argv) case 'b': opt_batch_size = atoi(optarg); break; + case 'C': + opt_pkt_count = atoi(optarg); + break; default: usage(basename(argv[0])); } @@ -574,7 +581,8 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, } } -static inline void complete_tx_only(struct xsk_socket_info *xsk) +static inline void complete_tx_only(struct xsk_socket_info *xsk, + int batch_size) { unsigned int rcvd; u32 idx; @@ -585,7 +593,7 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk) if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); - rcvd = xsk_ring_cons__peek(&xsk->umem->cq, opt_batch_size, &idx); + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); if (rcvd > 0) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; @@ -657,34 +665,62 @@ static void rx_drop_all(void) } } -static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb) +static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb, int batch_size) { u32 idx; unsigned int i; - while (xsk_ring_prod__reserve(&xsk->tx, opt_batch_size, &idx) < - opt_batch_size) { - complete_tx_only(xsk); + while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < + batch_size) { + complete_tx_only(xsk, batch_size); } - for (i = 0; i < opt_batch_size; i++) { + for (i = 0; i < batch_size; i++) { struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); tx_desc->addr = (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; tx_desc->len = sizeof(pkt_data) - 1; } - xsk_ring_prod__submit(&xsk->tx, opt_batch_size); - xsk->outstanding_tx += opt_batch_size; - frame_nb += opt_batch_size; + xsk_ring_prod__submit(&xsk->tx, batch_size); + xsk->outstanding_tx += batch_size; + frame_nb += batch_size; frame_nb %= NUM_FRAMES; - complete_tx_only(xsk); + complete_tx_only(xsk, batch_size); +} + +static inline int get_batch_size(int pkt_cnt) +{ + if (!opt_pkt_count) + return opt_batch_size; + + if (pkt_cnt + opt_batch_size <= opt_pkt_count) + return opt_batch_size; + + return opt_pkt_count - pkt_cnt; +} + +static void complete_tx_only_all(void) +{ + bool pending; + int i; + + do { + pending = false; + for (i = 0; i < num_socks; i++) { + if (xsks[i]->outstanding_tx) { + complete_tx_only(xsks[i], opt_batch_size); + pending = !!xsks[i]->outstanding_tx; + } + } + } while (pending); } static void tx_only_all(void) { struct pollfd fds[MAX_SOCKS] = {}; u32 frame_nb[MAX_SOCKS] = {}; + int pkt_cnt = 0; int i, ret; for (i = 0; i < num_socks; i++) { @@ -692,7 +728,9 @@ static void tx_only_all(void) fds[0].events = POLLOUT; } - for (;;) { + while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { + int batch_size = get_batch_size(pkt_cnt); + if (opt_poll) { ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) @@ -703,11 +741,16 @@ static void tx_only_all(void) } for (i = 0; i < num_socks; i++) - tx_only(xsks[i], frame_nb[i]); + tx_only(xsks[i], frame_nb[i], batch_size); + + pkt_cnt += batch_size; if (benchmark_done) break; } + + if (opt_pkt_count) + complete_tx_only_all(); } static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) @@ -900,6 +943,8 @@ int main(int argc, char **argv) else l2fwd_all(); + benchmark_done = true; + pthread_join(pt, NULL); xdpsock_cleanup(); From 4a3c23ae3acc6185a9d418830f22672a52ff986a Mon Sep 17 00:00:00 2001 From: Jay Jayatheerthan Date: Fri, 20 Dec 2019 14:25:29 +0530 Subject: [PATCH 122/127] samples/bpf: xdpsock: Add option to specify tx packet size New option '-s' or '--tx-pkt-size' has been added to specify the transmit packet size. The packet size ranges from 47 to 4096 bytes. When this option is not provided, it defaults to 64 byte packet. The code uses struct ethhdr, struct iphdr and struct udphdr to form the transmit packet. The MAC address, IP address and UDP ports are set to default values. The code calculates IP and UDP checksums before sending the packet. Checksum calculation code in Linux kernel is used for this purpose. The Ethernet FCS is not filled by the code. Signed-off-by: Jay Jayatheerthan Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191220085530.4980-6-jay.jayatheerthan@intel.com --- samples/bpf/xdpsock_user.c | 276 +++++++++++++++++++++++++++++++++++-- 1 file changed, 265 insertions(+), 11 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index f96ce3055d46..2297158a32bd 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -10,6 +10,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -45,11 +48,14 @@ #endif #define NUM_FRAMES (4 * 1024) +#define MIN_PKT_SIZE 64 #define DEBUG_HEXDUMP 0 typedef __u64 u64; typedef __u32 u32; +typedef __u16 u16; +typedef __u8 u8; static unsigned long prev_time; @@ -69,6 +75,8 @@ static unsigned long start_time; static bool benchmark_done; static u32 opt_batch_size = 64; static int opt_pkt_count; +static u16 opt_pkt_size = MIN_PKT_SIZE; +static u32 pkt_fill_pattern = 0x12345678; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -238,12 +246,6 @@ static void __exit_with_error(int error, const char *file, const char *func, #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \ __LINE__) -static const char pkt_data[] = - "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00" - "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14" - "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b" - "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa"; - static void swap_mac_addresses(void *data) { struct ether_header *eth = (struct ether_header *)data; @@ -291,10 +293,243 @@ static void hex_dump(void *pkt, size_t length, u64 addr) printf("\n"); } +static void *memset32_htonl(void *dest, u32 val, u32 size) +{ + u32 *ptr = (u32 *)dest; + int i; + + val = htonl(val); + + for (i = 0; i < (size & (~0x3)); i += 4) + ptr[i >> 2] = val; + + for (; i < size; i++) + ((char *)dest)[i] = ((char *)&val)[i & 3]; + + return dest; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static inline unsigned short from32to16(unsigned int x) +{ + /* add up 16-bit and 16-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static unsigned int do_csum(const unsigned char *buff, int len) +{ + unsigned int result = 0; + int odd; + + if (len <= 0) + goto out; + odd = 1 & (unsigned long)buff; + if (odd) { +#ifdef __LITTLE_ENDIAN + result += (*buff << 8); +#else + result = *buff; +#endif + len--; + buff++; + } + if (len >= 2) { + if (2 & (unsigned long)buff) { + result += *(unsigned short *)buff; + len -= 2; + buff += 2; + } + if (len >= 4) { + const unsigned char *end = buff + + ((unsigned int)len & ~3); + unsigned int carry = 0; + + do { + unsigned int w = *(unsigned int *)buff; + + buff += 4; + result += carry; + result += w; + carry = (w > result); + } while (buff < end); + result += carry; + result = (result & 0xffff) + (result >> 16); + } + if (len & 2) { + result += *(unsigned short *)buff; + buff += 2; + } + } + if (len & 1) +#ifdef __LITTLE_ENDIAN + result += *buff; +#else + result += (*buff << 8); +#endif + result = from32to16(result); + if (odd) + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); +out: + return result; +} + +__sum16 ip_fast_csum(const void *iph, unsigned int ihl); + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +__sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + return (__force __sum16)~do_csum(iph, ihl * 4); +} + +/* + * Fold a partial checksum + * This function code has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static inline __sum16 csum_fold(__wsum csum) +{ + u32 sum = (__force u32)csum; + + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return (__force __sum16)~sum; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static inline u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. */ + x = (x & 0xffffffff) + (x >> 32); + return (u32)x; +} + +__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, __wsum sum); + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, __wsum sum) +{ + unsigned long long s = (__force u32)sum; + + s += (__force u32)saddr; + s += (__force u32)daddr; +#ifdef __BIG_ENDIAN__ + s += proto + len; +#else + s += (proto + len) << 8; +#endif + return (__force __wsum)from64to32(s); +} + +/* + * This function has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static inline __sum16 +csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, + u8 proto, u16 *udp_pkt) +{ + u32 csum = 0; + u32 cnt = 0; + + /* udp hdr and data */ + for (; cnt < len; cnt += 2) + csum += udp_pkt[cnt >> 1]; + + return csum_tcpudp_magic(saddr, daddr, len, proto, csum); +} + +#define ETH_FCS_SIZE 4 + +#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct udphdr)) + +#define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) +#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) +#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) +#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) + +static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; + +static void gen_eth_hdr_data(void) +{ + struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + + sizeof(struct ethhdr) + + sizeof(struct iphdr)); + struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + + sizeof(struct ethhdr)); + struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; + + /* ethernet header */ + memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); + memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + eth_hdr->h_proto = htons(ETH_P_IP); + + /* IP header */ + ip_hdr->version = IPVERSION; + ip_hdr->ihl = 0x5; /* 20 byte header */ + ip_hdr->tos = 0x0; + ip_hdr->tot_len = htons(IP_PKT_SIZE); + ip_hdr->id = 0; + ip_hdr->frag_off = 0; + ip_hdr->ttl = IPDEFTTL; + ip_hdr->protocol = IPPROTO_UDP; + ip_hdr->saddr = htonl(0x0a0a0a10); + ip_hdr->daddr = htonl(0x0a0a0a20); + + /* IP header checksum */ + ip_hdr->check = 0; + ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl); + + /* UDP header */ + udp_hdr->source = htons(0x1000); + udp_hdr->dest = htons(0x1000); + udp_hdr->len = htons(UDP_PKT_SIZE); + + /* UDP data */ + memset32_htonl(pkt_data + PKT_HDR_SIZE, pkt_fill_pattern, + UDP_PKT_DATA_SIZE); + + /* UDP header checksum */ + udp_hdr->check = 0; + udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, + IPPROTO_UDP, (u16 *)udp_hdr); +} + static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) { memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, - sizeof(pkt_data) - 1); + PKT_SIZE); } static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) @@ -394,6 +629,7 @@ static struct option long_options[] = { {"duration", required_argument, 0, 'd'}, {"batch-size", required_argument, 0, 'b'}, {"tx-pkt-count", required_argument, 0, 'C'}, + {"tx-pkt-size", required_argument, 0, 's'}, {0, 0, 0, 0} }; @@ -424,9 +660,14 @@ static void usage(const char *prog) " packets. Default: %d\n" " -C, --tx-pkt-count=n Number of packets to send.\n" " Default: Continuous packets.\n" + " -s, --tx-pkt-size=n Transmit packet size.\n" + " (Default: %d bytes)\n" + " Min size: %d, Max size %d.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, - opt_batch_size); + opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, + XSK_UMEM__DEFAULT_FRAME_SIZE); + exit(EXIT_FAILURE); } @@ -437,7 +678,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:", long_options, &option_index); if (c == -1) break; @@ -505,6 +746,16 @@ static void parse_command_line(int argc, char **argv) case 'C': opt_pkt_count = atoi(optarg); break; + case 's': + opt_pkt_size = atoi(optarg); + if (opt_pkt_size > (XSK_UMEM__DEFAULT_FRAME_SIZE) || + opt_pkt_size < MIN_PKT_SIZE) { + fprintf(stderr, + "ERROR: Invalid frame size %d\n", + opt_pkt_size); + usage(basename(argv[0])); + } + break; default: usage(basename(argv[0])); } @@ -679,7 +930,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb, int batch_size) struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); tx_desc->addr = (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; - tx_desc->len = sizeof(pkt_data) - 1; + tx_desc->len = PKT_SIZE; } xsk_ring_prod__submit(&xsk->tx, batch_size); @@ -916,9 +1167,12 @@ int main(int argc, char **argv) for (i = 0; i < opt_num_xsks; i++) xsks[num_socks++] = xsk_configure_socket(umem, rx, tx); - if (opt_bench == BENCH_TXONLY) + if (opt_bench == BENCH_TXONLY) { + gen_eth_hdr_data(); + for (i = 0; i < NUM_FRAMES; i++) gen_eth_frame(umem, i * opt_xsk_frame_size); + } if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY) enter_xsks_into_map(obj); From 46e3268eaaca9f8a0f145872b96fe6d54a232890 Mon Sep 17 00:00:00 2001 From: Jay Jayatheerthan Date: Fri, 20 Dec 2019 14:25:30 +0530 Subject: [PATCH 123/127] samples/bpf: xdpsock: Add option to specify transmit fill pattern The UDP payload fill pattern can be specified using '-P' or '--tx-pkt-pattern' option. It is an unsigned 32 bit field and defaulted to 0x12345678. The IP and UDP checksum is calculated by the code as per the content of the packet before transmission. Signed-off-by: Jay Jayatheerthan Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191220085530.4980-7-jay.jayatheerthan@intel.com --- samples/bpf/xdpsock_user.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 2297158a32bd..d74c4c83fc93 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -76,7 +76,7 @@ static bool benchmark_done; static u32 opt_batch_size = 64; static int opt_pkt_count; static u16 opt_pkt_size = MIN_PKT_SIZE; -static u32 pkt_fill_pattern = 0x12345678; +static u32 opt_pkt_fill_pattern = 0x12345678; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -517,7 +517,7 @@ static void gen_eth_hdr_data(void) udp_hdr->len = htons(UDP_PKT_SIZE); /* UDP data */ - memset32_htonl(pkt_data + PKT_HDR_SIZE, pkt_fill_pattern, + memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern, UDP_PKT_DATA_SIZE); /* UDP header checksum */ @@ -630,6 +630,7 @@ static struct option long_options[] = { {"batch-size", required_argument, 0, 'b'}, {"tx-pkt-count", required_argument, 0, 'C'}, {"tx-pkt-size", required_argument, 0, 's'}, + {"tx-pkt-pattern", required_argument, 0, 'P'}, {0, 0, 0, 0} }; @@ -663,10 +664,11 @@ static void usage(const char *prog) " -s, --tx-pkt-size=n Transmit packet size.\n" " (Default: %d bytes)\n" " Min size: %d, Max size %d.\n" + " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, - XSK_UMEM__DEFAULT_FRAME_SIZE); + XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern); exit(EXIT_FAILURE); } @@ -678,7 +680,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:", long_options, &option_index); if (c == -1) break; @@ -756,6 +758,9 @@ static void parse_command_line(int argc, char **argv) usage(basename(argv[0])); } break; + case 'P': + opt_pkt_fill_pattern = strtol(optarg, NULL, 16); + break; default: usage(basename(argv[0])); } From f9e6bfdbaf0cf304d72c70a05d81acac01a04f48 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 20 Dec 2019 17:19:36 +0100 Subject: [PATCH 124/127] samples/bpf: Xdp_redirect_cpu fix missing tracepoint attach When sample xdp_redirect_cpu was converted to use libbpf, the tracepoints used by this sample were not getting attached automatically like with bpf_load.c. The BPF-maps was still getting loaded, thus nobody notice that the tracepoints were not updating these maps. This fix doesn't use the new skeleton code, as this bug was introduced in v5.1 and stable might want to backport this. E.g. Red Hat QA uses this sample as part of their testing. Fixes: bbaf6029c49c ("samples/bpf: Convert XDP samples to libbpf usage") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/157685877642.26195.2798780195186786841.stgit@firesoul --- samples/bpf/xdp_redirect_cpu_user.c | 59 +++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 72af628529b5..79a2fb7d16cb 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -16,6 +16,10 @@ static const char *__doc__ = #include #include #include +#include + +#define __must_check +#include #include #include @@ -46,6 +50,10 @@ static int cpus_count_map_fd; static int cpus_iterator_map_fd; static int exception_cnt_map_fd; +#define NUM_TP 5 +struct bpf_link *tp_links[NUM_TP] = { 0 }; +static int tp_cnt = 0; + /* Exit return codes */ #define EXIT_OK 0 #define EXIT_FAIL 1 @@ -88,6 +96,10 @@ static void int_exit(int sig) printf("program on interface changed, not removing\n"); } } + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + exit(EXIT_OK); } @@ -588,23 +600,61 @@ static void stats_poll(int interval, bool use_separators, char *prog_name, free_stats_record(prev); } +static struct bpf_link * attach_tp(struct bpf_object *obj, + const char *tp_category, + const char* tp_name) +{ + struct bpf_program *prog; + struct bpf_link *link; + char sec_name[PATH_MAX]; + int len; + + len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s", + tp_category, tp_name); + if (len < 0) + exit(EXIT_FAIL); + + prog = bpf_object__find_program_by_title(obj, sec_name); + if (!prog) { + fprintf(stderr, "ERR: finding progsec: %s\n", sec_name); + exit(EXIT_FAIL_BPF); + } + + link = bpf_program__attach_tracepoint(prog, tp_category, tp_name); + if (IS_ERR(link)) + exit(EXIT_FAIL_BPF); + + return link; +} + +static void init_tracepoints(struct bpf_object *obj) { + tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err"); + tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err"); + tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception"); + tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue"); + tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread"); +} + static int init_map_fds(struct bpf_object *obj) { - cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map"); - rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt"); + /* Maps updated by tracepoints */ redirect_err_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt"); + exception_cnt_map_fd = + bpf_object__find_map_fd_by_name(obj, "exception_cnt"); cpumap_enqueue_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt"); cpumap_kthread_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt"); + + /* Maps used by XDP */ + rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt"); + cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map"); cpus_available_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_available"); cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count"); cpus_iterator_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_iterator"); - exception_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "exception_cnt"); if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 || redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 || @@ -662,6 +712,7 @@ int main(int argc, char **argv) strerror(errno)); return EXIT_FAIL; } + init_tracepoints(obj); if (init_map_fds(obj) < 0) { fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n"); return EXIT_FAIL; From 8ab9da573dad95536d8a92f19d7967e8142cd827 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 23 Dec 2019 10:03:05 -0800 Subject: [PATCH 125/127] libbpf: Support CO-RE relocations for LDX/ST/STX instructions Clang patch [0] enables emitting relocatable generic ALU/ALU64 instructions (i.e, shifts and arithmetic operations), as well as generic load/store instructions. The former ones are already supported by libbpf as is. This patch adds further support for load/store instructions. Relocatable field offset is encoded in BPF instruction's 16-bit offset section and are adjusted by libbpf based on target kernel BTF. These Clang changes and corresponding libbpf changes allow for more succinct generated BPF code by encoding relocatable field reads as a single ST/LDX/STX instruction. It also enables relocatable access to BPF context. Previously, if context struct (e.g., __sk_buff) was accessed with CO-RE relocations (e.g., due to preserve_access_index attribute), it would be rejected by BPF verifier due to modified context pointer dereference. With Clang patch, such context accesses are both relocatable and have a fixed offset from the point of view of BPF verifier. [0] https://reviews.llvm.org/D71790 Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191223180305.86417-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9576a90c5a1c..7513165b104f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -3810,11 +3811,13 @@ static int bpf_core_reloc_insn(struct bpf_program *prog, insn = &prog->insns[insn_idx]; class = BPF_CLASS(insn->code); - if (class == BPF_ALU || class == BPF_ALU64) { + switch (class) { + case BPF_ALU: + case BPF_ALU64: if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; if (!failed && validate && insn->imm != orig_val) { - pr_warn("prog '%s': unexpected insn #%d value: got %u, exp %u -> %u\n", + pr_warn("prog '%s': unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", bpf_program__title(prog, false), insn_idx, insn->imm, orig_val, new_val); return -EINVAL; @@ -3824,7 +3827,29 @@ static int bpf_core_reloc_insn(struct bpf_program *prog, pr_debug("prog '%s': patched insn #%d (ALU/ALU64)%s imm %u -> %u\n", bpf_program__title(prog, false), insn_idx, failed ? " w/ failed reloc" : "", orig_val, new_val); - } else { + break; + case BPF_LDX: + case BPF_ST: + case BPF_STX: + if (!failed && validate && insn->off != orig_val) { + pr_warn("prog '%s': unexpected insn #%d (LD/LDX/ST/STX) value: got %u, exp %u -> %u\n", + bpf_program__title(prog, false), insn_idx, + insn->off, orig_val, new_val); + return -EINVAL; + } + if (new_val > SHRT_MAX) { + pr_warn("prog '%s': insn #%d (LD/LDX/ST/STX) value too big: %u\n", + bpf_program__title(prog, false), insn_idx, + new_val); + return -ERANGE; + } + orig_val = insn->off; + insn->off = new_val; + pr_debug("prog '%s': patched insn #%d (LD/LDX/ST/STX)%s off %u -> %u\n", + bpf_program__title(prog, false), insn_idx, + failed ? " w/ failed reloc" : "", orig_val, new_val); + break; + default: pr_warn("prog '%s': trying to relocate unrecognized insn #%d, code:%x, src:%x, dst:%x, off:%x, imm:%x\n", bpf_program__title(prog, false), insn_idx, insn->code, insn->src_reg, insn->dst_reg, From 1162f844030ac1ac7321b5e8f6c9badc7a11428f Mon Sep 17 00:00:00 2001 From: Hechao Li Date: Mon, 23 Dec 2019 17:17:42 -0800 Subject: [PATCH 126/127] bpf: Print error message for bpftool cgroup show Currently, when bpftool cgroup show has an error, no error message is printed. This is confusing because the user may think the result is empty. Before the change: $ bpftool cgroup show /sys/fs/cgroup ID AttachType AttachFlags Name $ echo $? 255 After the change: $ ./bpftool cgroup show /sys/fs/cgroup Error: can't query bpf programs attached to /sys/fs/cgroup: Operation not permitted v2: Rename check_query_cgroup_progs to cgroup_has_attached_progs Signed-off-by: Hechao Li Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191224011742.3714301-1-hechaol@fb.com --- tools/bpf/bpftool/cgroup.c | 56 ++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 1ef45e55039e..2f017caa678d 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -117,6 +117,25 @@ static int count_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type) return prog_cnt; } +static int cgroup_has_attached_progs(int cgroup_fd) +{ + enum bpf_attach_type type; + bool no_prog = true; + + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { + int count = count_attached_bpf_progs(cgroup_fd, type); + + if (count < 0 && errno != EINVAL) + return -1; + + if (count > 0) { + no_prog = false; + break; + } + } + + return no_prog ? 0 : 1; +} static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, int level) { @@ -161,6 +180,7 @@ static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, static int do_show(int argc, char **argv) { enum bpf_attach_type type; + int has_attached_progs; const char *path; int cgroup_fd; int ret = -1; @@ -192,6 +212,16 @@ static int do_show(int argc, char **argv) goto exit; } + has_attached_progs = cgroup_has_attached_progs(cgroup_fd); + if (has_attached_progs < 0) { + p_err("can't query bpf programs attached to %s: %s", + path, strerror(errno)); + goto exit_cgroup; + } else if (!has_attached_progs) { + ret = 0; + goto exit_cgroup; + } + if (json_output) jsonw_start_array(json_wtr); else @@ -212,6 +242,7 @@ static int do_show(int argc, char **argv) if (json_output) jsonw_end_array(json_wtr); +exit_cgroup: close(cgroup_fd); exit: return ret; @@ -228,7 +259,7 @@ static int do_show_tree_fn(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftw) { enum bpf_attach_type type; - bool skip = true; + int has_attached_progs; int cgroup_fd; if (typeflag != FTW_D) @@ -240,22 +271,13 @@ static int do_show_tree_fn(const char *fpath, const struct stat *sb, return SHOW_TREE_FN_ERR; } - for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { - int count = count_attached_bpf_progs(cgroup_fd, type); - - if (count < 0 && errno != EINVAL) { - p_err("can't query bpf programs attached to %s: %s", - fpath, strerror(errno)); - close(cgroup_fd); - return SHOW_TREE_FN_ERR; - } - if (count > 0) { - skip = false; - break; - } - } - - if (skip) { + has_attached_progs = cgroup_has_attached_progs(cgroup_fd); + if (has_attached_progs < 0) { + p_err("can't query bpf programs attached to %s: %s", + fpath, strerror(errno)); + close(cgroup_fd); + return SHOW_TREE_FN_ERR; + } else if (!has_attached_progs) { close(cgroup_fd); return 0; } From 7c8dce4b166113743adad131b5a24c4acc12f92c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 26 Dec 2019 13:02:53 -0800 Subject: [PATCH 127/127] bpftool: Make skeleton C code compilable with C++ compiler When auto-generated BPF skeleton C code is included from C++ application, it triggers compilation error due to void * being implicitly casted to whatever target pointer type. This is supported by C, but not C++. To solve this problem, add explicit casts, where necessary. To ensure issues like this are captured going forward, add skeleton usage in test_cpp test. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191226210253.3132060-1-andriin@fb.com --- tools/bpf/bpftool/gen.c | 10 +++++----- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/test_cpp.cpp | 10 ++++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index a14d8bc5d31d..7ce09a9a6999 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -397,7 +397,7 @@ static int do_skeleton(int argc, char **argv) { \n\ struct %1$s *obj; \n\ \n\ - obj = calloc(1, sizeof(*obj)); \n\ + obj = (typeof(obj))calloc(1, sizeof(*obj)); \n\ if (!obj) \n\ return NULL; \n\ if (%1$s__create_skeleton(obj)) \n\ @@ -461,7 +461,7 @@ static int do_skeleton(int argc, char **argv) { \n\ struct bpf_object_skeleton *s; \n\ \n\ - s = calloc(1, sizeof(*s)); \n\ + s = (typeof(s))calloc(1, sizeof(*s)); \n\ if (!s) \n\ return -1; \n\ obj->skeleton = s; \n\ @@ -479,7 +479,7 @@ static int do_skeleton(int argc, char **argv) /* maps */ \n\ s->map_cnt = %zu; \n\ s->map_skel_sz = sizeof(*s->maps); \n\ - s->maps = calloc(s->map_cnt, s->map_skel_sz);\n\ + s->maps = (typeof(s->maps))calloc(s->map_cnt, s->map_skel_sz);\n\ if (!s->maps) \n\ goto err; \n\ ", @@ -515,7 +515,7 @@ static int do_skeleton(int argc, char **argv) /* programs */ \n\ s->prog_cnt = %zu; \n\ s->prog_skel_sz = sizeof(*s->progs); \n\ - s->progs = calloc(s->prog_cnt, s->prog_skel_sz);\n\ + s->progs = (typeof(s->progs))calloc(s->prog_cnt, s->prog_skel_sz);\n\ if (!s->progs) \n\ goto err; \n\ ", @@ -538,7 +538,7 @@ static int do_skeleton(int argc, char **argv) \n\ \n\ s->data_sz = %d; \n\ - s->data = \"\\ \n\ + s->data = (void *)\"\\ \n\ ", file_sz); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 866fc1cadd7c..41691fb067da 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -371,7 +371,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) $(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ # Make sure we are able to include and link libbpf against c++. -$(OUTPUT)/test_cpp: test_cpp.cpp $(BPFOBJ) +$(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg, CXX,,$@) $(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp index f0eb2727b766..6fe23a10d48a 100644 --- a/tools/testing/selftests/bpf/test_cpp.cpp +++ b/tools/testing/selftests/bpf/test_cpp.cpp @@ -1,12 +1,16 @@ /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#include #include "libbpf.h" #include "bpf.h" #include "btf.h" +#include "test_core_extern.skel.h" /* do nothing, just make sure we can link successfully */ int main(int argc, char *argv[]) { + struct test_core_extern *skel; + /* libbpf.h */ libbpf_set_print(NULL); @@ -16,5 +20,11 @@ int main(int argc, char *argv[]) /* btf.h */ btf__new(NULL, 0); + /* BPF skeleton */ + skel = test_core_extern__open_and_load(); + test_core_extern__destroy(skel); + + std::cout << "DONE!" << std::endl; + return 0; }