From 4936e3528e3e272c567fe4ff0abb7ce3e1500575 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:26 +0200 Subject: [PATCH 01/10] bpf: minor cleanups in ebpf code Besides others, remove redundant comments where the code is self documenting enough, and properly indent various bpf_verifier_ops and bpf_prog_type_list declarations. Moreover, remove two exports that actually have no module user. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 2 -- net/core/filter.c | 34 +++++++++++++++------------------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d781b077431f..5313d09d4b62 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -129,14 +129,12 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, return fp; } -EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { kfree(fp->aux); vfree(fp); } -EXPORT_SYMBOL_GPL(__bpf_prog_free); #ifdef CONFIG_BPF_JIT struct bpf_binary_header * diff --git a/net/core/filter.c b/net/core/filter.c index 71c2a1f473ad..ea51b479cf02 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2069,16 +2069,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) static bool __is_valid_access(int off, int size, enum bpf_access_type type) { - /* check bounds */ if (off < 0 || off >= sizeof(struct __sk_buff)) return false; - - /* disallow misaligned access */ + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - - /* all __sk_buff fields are __u32 */ - if (size != 4) + if (size != sizeof(__u32)) return false; return true; @@ -2097,7 +2093,7 @@ static bool sk_filter_is_valid_access(int off, int size, if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, cb[0]) ... 
- offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): break; default: return false; @@ -2278,30 +2274,30 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, } static const struct bpf_verifier_ops sk_filter_ops = { - .get_func_proto = sk_filter_func_proto, - .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { - .get_func_proto = tc_cls_act_func_proto, - .is_valid_access = tc_cls_act_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = tc_cls_act_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { - .ops = &sk_filter_ops, - .type = BPF_PROG_TYPE_SOCKET_FILTER, + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SOCKET_FILTER, }; static struct bpf_prog_type_list sched_cls_type __read_mostly = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_CLS, + .ops = &tc_cls_act_ops, + .type = BPF_PROG_TYPE_SCHED_CLS, }; static struct bpf_prog_type_list sched_act_type __read_mostly = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_ACT, + .ops = &tc_cls_act_ops, + .type = BPF_PROG_TYPE_SCHED_ACT, }; static int __init register_sk_filter_ops(void) From c94987e40ebbae3b7b6c3ece37b6f8338830f6b1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:27 +0200 Subject: [PATCH 02/10] bpf: move bpf_jit_enable declaration Move the bpf_jit_enable declaration to the filter.h file where most other core code is declared, also since we're going to add a second knob there. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 2 ++ include/linux/netdevice.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index ec1411c89105..4ff0e647598f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -496,6 +496,8 @@ void bpf_int_jit_compile(struct bpf_prog *fp); bool bpf_helper_changes_skb_data(void *func); #ifdef CONFIG_BPF_JIT +extern int bpf_jit_enable; + typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); struct bpf_binary_header * diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c2f5112f08f7..c148edfe4965 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3759,7 +3759,6 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; extern int weight_p; -extern int bpf_jit_enable; bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, From 6077776b5908e0493a3946f7d3bc63871b201e87 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:28 +0200 Subject: [PATCH 03/10] bpf: split HAVE_BPF_JIT into cBPF and eBPF variant Split the HAVE_BPF_JIT into two for distinguishing cBPF and eBPF JITs. 
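As a sketch of the kind of check this split enables, a later patch in this series (patch 7) adds a small helper along the following lines so generic code can tell whether the arch provides an eBPF JIT; the version below is shown only for orientation, the authoritative one is in that patch:

	static inline bool bpf_jit_is_ebpf(void)
	{
		/* Only archs selecting HAVE_EBPF_JIT run an eBPF JIT; the
		 * byte-code level blinding added later keys off this.
		 */
	#ifdef CONFIG_HAVE_EBPF_JIT
		return true;
	#else
		return false;
	#endif
	}
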
Current cBPF ones: # git grep -n HAVE_CBPF_JIT arch/ arch/arm/Kconfig:44: select HAVE_CBPF_JIT arch/mips/Kconfig:18: select HAVE_CBPF_JIT if !CPU_MICROMIPS arch/powerpc/Kconfig:129: select HAVE_CBPF_JIT arch/sparc/Kconfig:35: select HAVE_CBPF_JIT Current eBPF ones: # git grep -n HAVE_EBPF_JIT arch/ arch/arm64/Kconfig:61: select HAVE_EBPF_JIT arch/s390/Kconfig:126: select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES arch/x86/Kconfig:94: select HAVE_EBPF_JIT if X86_64 Later code also needs this facility to check for eBPF JITs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- net/Kconfig | 14 +++++++++++--- 8 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index cdfa6c2b7626..2315b0d1b4f4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -41,7 +41,7 @@ config ARM select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT) select HAVE_ARCH_TRACEHOOK select HAVE_ARM_SMCCC if CPU_V7 - select HAVE_BPF_JIT + select HAVE_CBPF_JIT select HAVE_CC_STACKPROTECTOR select HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4f436220384f..e6761ea2feec 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -58,7 +58,7 @@ config ARM64 select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK - select HAVE_BPF_JIT + select HAVE_EBPF_JIT select HAVE_C_RECORDMCOUNT select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 2018c2b0e078..3ee1ea61b2dc 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -15,7 +15,7 @@ config MIPS select HAVE_ARCH_KGDB select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK - select HAVE_BPF_JIT if !CPU_MICROMIPS + select HAVE_CBPF_JIT if !CPU_MICROMIPS select HAVE_FUNCTION_TRACER select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7cd32c038286..2fdb73d9198a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -126,7 +126,7 @@ config PPC select IRQ_FORCED_THREADING select HAVE_RCU_TABLE_FREE if SMP select HAVE_SYSCALL_TRACEPOINTS - select HAVE_BPF_JIT + select HAVE_CBPF_JIT select HAVE_ARCH_JUMP_LABEL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf24ab188921..a883981c0174 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -126,7 +126,7 @@ config S390 select HAVE_ARCH_SOFT_DIRTY select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_BPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES + select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_DEBUG_KMEMLEAK diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 57ffaf285c2f..d5003812c748 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -32,7 +32,7 @@ config SPARC select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_PCI_IOMAP select HAVE_NMI_WATCHDOG if SPARC64 - select HAVE_BPF_JIT + select HAVE_CBPF_JIT select HAVE_DEBUG_BUGVERBOSE select GENERIC_SMP_IDLE_THREAD select GENERIC_CLOCKEVENTS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2dc18605831f..ae83046d51a8 100644 --- a/arch/x86/Kconfig +++ 
b/arch/x86/Kconfig @@ -91,7 +91,7 @@ config X86 select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_BPF_JIT if X86_64 + select HAVE_EBPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL diff --git a/net/Kconfig b/net/Kconfig index b841c42e5c9b..f7148f24f114 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -289,7 +289,7 @@ config BQL config BPF_JIT bool "enable BPF Just In Time compiler" - depends on HAVE_BPF_JIT + depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT depends on MODULES ---help--- Berkeley Packet Filter filtering capabilities are normally handled @@ -419,6 +419,14 @@ config MAY_USE_DEVLINK endif # if NET -# Used by archs to tell that they support BPF_JIT -config HAVE_BPF_JIT +# Used by archs to tell that they support BPF JIT compiler plus which flavour. +# Only one of the two can be selected for a specific arch since eBPF JIT supersedes +# the cBPF JIT. + +# Classic BPF JIT (cBPF) +config HAVE_CBPF_JIT + bool + +# Extended BPF JIT (eBPF) +config HAVE_EBPF_JIT bool From 93a73d442d370e20ed1009cd79cb29c4d7c0ee86 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:29 +0200 Subject: [PATCH 04/10] bpf, x86/arm64: remove useless checks on prog There is never such a situation, where bpf_int_jit_compile() is called with either prog as NULL or len as 0, so the tests are unnecessary and confusing as people would just copy them. s390 doesn't have them, so no change is needed there. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/arm64/net/bpf_jit_comp.c | 3 --- arch/x86/net/bpf_jit_comp.c | 3 --- 2 files changed, 6 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index b405bbb54431..ef35e866caf7 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -772,9 +772,6 @@ void bpf_int_jit_compile(struct bpf_prog *prog) if (!bpf_jit_enable) return; - if (!prog || !prog->len) - return; - memset(&ctx, 0, sizeof(ctx)); ctx.prog = prog; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4286f3618bd0..f5bfd4fd28dd 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1086,9 +1086,6 @@ void bpf_int_jit_compile(struct bpf_prog *prog) if (!bpf_jit_enable) return; - if (!prog || !prog->len) - return; - addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL); if (!addrs) return; From c237ee5eb33bf19fe0591c04ff8db19da7323a83 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:30 +0200 Subject: [PATCH 05/10] bpf: add bpf_patch_insn_single helper Move the functionality to patch instructions out of the verifier code and into the core as the new bpf_patch_insn_single() helper will be needed later on for blinding as well. No changes in functionality. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/filter.h | 3 ++ kernel/bpf/core.c | 71 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 53 ++++++------------------------- 3 files changed, 83 insertions(+), 44 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 4ff0e647598f..c4aae496f376 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -495,6 +495,9 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_int_jit_compile(struct bpf_prog *fp); bool bpf_helper_changes_skb_data(void *func); +struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, + const struct bpf_insn *patch, u32 len); + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5313d09d4b62..49b5538a5301 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,77 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } +static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + /* Call and Exit are both special jumps with no + * target inside the BPF instruction image. + */ + BPF_OP(insn->code) != BPF_CALL && + BPF_OP(insn->code) != BPF_EXIT; +} + +static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) +{ + struct bpf_insn *insn = prog->insnsi; + u32 i, insn_cnt = prog->len; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_is_jmp_and_has_target(insn)) + continue; + + /* Adjust offset of jmps if we cross boundaries. */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) + insn->off -= delta; + } +} + +struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, + const struct bpf_insn *patch, u32 len) +{ + u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; + struct bpf_prog *prog_adj; + + /* Since our patchlet doesn't expand the image, we're done. */ + if (insn_delta == 0) { + memcpy(prog->insnsi + off, patch, sizeof(*patch)); + return prog; + } + + insn_adj_cnt = prog->len + insn_delta; + + /* Several new instructions need to be inserted. Make room + * for them. Likely, there's no need for a new allocation as + * last page could have large enough tailroom. + */ + prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), + GFP_USER); + if (!prog_adj) + return NULL; + + prog_adj->len = insn_adj_cnt; + + /* Patching happens in 3 steps: + * + * 1) Move over tail of insnsi from next instruction onwards, + * so we can patch the single target insn with one or more + * new ones (patching is always from 1 to n insns, n > 0). + * 2) Inject new instructions at the target location. + * 3) Adjust branch offsets if necessary. 
+ */ + insn_rest = insn_adj_cnt - off - len; + + memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, + sizeof(*patch) * insn_rest); + memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); + + bpf_adj_branches(prog_adj, off, insn_delta); + + return prog_adj; +} + #ifdef CONFIG_BPF_JIT struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 84bff68cf80e..a08d66215245 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2587,26 +2587,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } -static void adjust_branches(struct bpf_prog *prog, int pos, int delta) -{ - struct bpf_insn *insn = prog->insnsi; - int insn_cnt = prog->len; - int i; - - for (i = 0; i < insn_cnt; i++, insn++) { - if (BPF_CLASS(insn->code) != BPF_JMP || - BPF_OP(insn->code) == BPF_CALL || - BPF_OP(insn->code) == BPF_EXIT) - continue; - - /* adjust offset of jmps if necessary */ - if (i < pos && i + insn->off + 1 > pos) - insn->off += delta; - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) - insn->off -= delta; - } -} - /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ @@ -2616,14 +2596,15 @@ static int convert_ctx_accesses(struct verifier_env *env) int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; - u32 cnt; - int i; enum bpf_access_type type; + int i; if (!env->prog->aux->ops->convert_ctx_access) return 0; for (i = 0; i < insn_cnt; i++, insn++) { + u32 insn_delta, cnt; + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) type = BPF_READ; else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) @@ -2645,34 +2626,18 @@ static int convert_ctx_accesses(struct verifier_env *env) return -EINVAL; } - if (cnt == 1) { - memcpy(insn, insn_buf, sizeof(*insn)); - continue; - } - - /* several new insns need to be inserted. Make room for them */ - insn_cnt += cnt - 1; - new_prog = bpf_prog_realloc(env->prog, - bpf_prog_size(insn_cnt), - GFP_USER); + new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); if (!new_prog) return -ENOMEM; - new_prog->len = insn_cnt; - - memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, - sizeof(*insn) * (insn_cnt - i - cnt)); - - /* copy substitute insns in place of load instruction */ - memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); - - /* adjust branches in the whole program */ - adjust_branches(new_prog, i, cnt - 1); + insn_delta = cnt - 1; /* keep walking new program and skip insns we just inserted */ env->prog = new_prog; - insn = new_prog->insnsi + i + cnt - 1; - i += cnt - 1; + insn = new_prog->insnsi + i + insn_delta; + + insn_cnt += insn_delta; + i += insn_delta; } return 0; From d1c55ab5e41fcd72cb0a8bef86d3f652ad9ad9f5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:31 +0200 Subject: [PATCH 06/10] bpf: prepare bpf_int_jit_compile/bpf_prog_select_runtime apis Since the blinding is strictly only called from inside eBPF JITs, we need to change signatures for bpf_int_jit_compile() and bpf_prog_select_runtime() first in order to prepare that the eBPF program we're dealing with can change underneath. Hence, for call sites, we need to return the latest prog. No functional change in this patch. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- arch/arm64/net/bpf_jit_comp.c | 7 ++++--- arch/s390/net/bpf_jit_comp.c | 8 +++++--- arch/x86/net/bpf_jit_comp.c | 7 ++++--- include/linux/filter.h | 5 +++-- kernel/bpf/core.c | 18 ++++++++++++++---- kernel/bpf/syscall.c | 2 +- lib/test_bpf.c | 5 ++++- net/core/filter.c | 6 +++++- 8 files changed, 40 insertions(+), 18 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index ef35e866caf7..dd428807cb30 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -762,7 +762,7 @@ void bpf_jit_compile(struct bpf_prog *prog) /* Nothing to do here. We support Internal BPF. */ } -void bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header; struct jit_ctx ctx; @@ -770,14 +770,14 @@ void bpf_int_jit_compile(struct bpf_prog *prog) u8 *image_ptr; if (!bpf_jit_enable) - return; + return prog; memset(&ctx, 0, sizeof(ctx)); ctx.prog = prog; ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL); if (ctx.offset == NULL) - return; + return prog; /* 1. Initial fake pass to compute ctx->idx. */ @@ -828,6 +828,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) prog->jited = 1; out: kfree(ctx.offset); + return prog; } void bpf_jit_free(struct bpf_prog *prog) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 3c0bfc1f2694..fcf301a889e7 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1262,18 +1262,19 @@ void bpf_jit_compile(struct bpf_prog *fp) /* * Compile eBPF program "fp" */ -void bpf_int_jit_compile(struct bpf_prog *fp) +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { struct bpf_binary_header *header; struct bpf_jit jit; int pass; if (!bpf_jit_enable) - return; + return fp; + memset(&jit, 0, sizeof(jit)); jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); if (jit.addrs == NULL) - return; + return fp; /* * Three initial passes: * - 1/2: Determine clobbered registers @@ -1305,6 +1306,7 @@ void bpf_int_jit_compile(struct bpf_prog *fp) } free_addrs: kfree(jit.addrs); + return fp; } /* diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f5bfd4fd28dd..6b2d23ea3590 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1073,7 +1073,7 @@ void bpf_jit_compile(struct bpf_prog *prog) { } -void bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; int proglen, oldproglen = 0; @@ -1084,11 +1084,11 @@ void bpf_int_jit_compile(struct bpf_prog *prog) int i; if (!bpf_jit_enable) - return; + return prog; addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL); if (!addrs) - return; + return prog; /* Before first pass, make a rough estimation of addrs[] * each bpf instruction is translated to less than 64 bytes @@ -1140,6 +1140,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) } out: kfree(addrs); + return prog; } void bpf_jit_free(struct bpf_prog *fp) diff --git a/include/linux/filter.h b/include/linux/filter.h index c4aae496f376..891852cf7716 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -458,7 +458,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) int sk_filter(struct sock *sk, struct sk_buff *skb); -int bpf_prog_select_runtime(struct bpf_prog *fp); +struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); struct bpf_prog *bpf_prog_alloc(unsigned int 
size, gfp_t gfp_extra_flags); @@ -492,7 +492,8 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -void bpf_int_jit_compile(struct bpf_prog *fp); + +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); bool bpf_helper_changes_skb_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 49b5538a5301..70f0821aca47 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -761,15 +761,22 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program + * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via BPF_PROG_RUN() macro. */ -int bpf_prog_select_runtime(struct bpf_prog *fp) +struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { fp->bpf_func = (void *) __bpf_prog_run; - bpf_int_jit_compile(fp); + /* eBPF JITs can rewrite the program in case constant + * blinding is active. However, in case of error during + * blinding, bpf_int_jit_compile() must always return a + * valid program, which in this case would simply not + * be JITed, but falls back to the interpreter. + */ + fp = bpf_int_jit_compile(fp); bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -777,7 +784,9 @@ int bpf_prog_select_runtime(struct bpf_prog *fp) * with JITed or non JITed program concatenations and not * all eBPF JITs might immediately support all features. */ - return bpf_check_tail_call(fp); + *err = bpf_check_tail_call(fp); + + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); @@ -859,8 +868,9 @@ const struct bpf_func_proto bpf_tail_call_proto = { }; /* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ -void __weak bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { + return prog; } bool __weak bpf_helper_changes_skb_data(void *func) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf5e9f7ad13a..46ecce4b79ed 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -762,7 +762,7 @@ static int bpf_prog_load(union bpf_attr *attr) fixup_bpf_calls(prog); /* eBPF program is ready to be JITed */ - err = bpf_prog_select_runtime(prog); + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 8f22fbedc3a6..93f45011a59d 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -5621,7 +5621,10 @@ static struct bpf_prog *generate_filter(int which, int *err) fp->type = BPF_PROG_TYPE_SOCKET_FILTER; memcpy(fp->insnsi, fptr, fp->len * sizeof(struct bpf_insn)); - bpf_prog_select_runtime(fp); + /* We cannot error here as we don't need type compatibility + * checks. + */ + fp = bpf_prog_select_runtime(fp, err); break; } diff --git a/net/core/filter.c b/net/core/filter.c index ea51b479cf02..68adb5f52110 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -994,7 +994,11 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) */ goto out_err_free; - bpf_prog_select_runtime(fp); + /* We are guaranteed to never error here with cBPF to eBPF + * transitions, since there's no issue with type compatibility + * checks on program arrays. 
+ */ + fp = bpf_prog_select_runtime(fp, &err); kfree(old_prog); return fp; From 4f3446bb809f20ad56cadf712e6006815ae7a8f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:32 +0200 Subject: [PATCH 07/10] bpf: add generic constant blinding for use in jits This work adds a generic facility for use from eBPF JIT compilers that allows for further hardening of JIT generated images through blinding constants. In response to the original work on BPF JIT spraying published by Keegan McAllister [1], most BPF JITs were changed to make images read-only and start at a randomized offset in the page, where the rest was filled with trap instructions. We have this nowadays in x86, arm, arm64 and s390 JIT compilers. Additionally, later work also made eBPF interpreter images read only for kernels supporting DEBUG_SET_MODULE_RONX, that is, x86, arm, arm64 and s390 archs as well currently. This is done by default for mentioned JITs when JITing is enabled. Furthermore, we had a generic and configurable constant blinding facility on our todo for quite some time now to further make spraying harder, and first implementation since around netconf 2016. We found that for systems where untrusted users can load cBPF/eBPF code where JIT is enabled, start offset randomization helps a bit to make jumps into crafted payload harder, but in case where larger programs that cross page boundary are injected, we again have some part of the program opcodes at a page start offset. With improved guessing and more reliable payload injection, chances can increase to jump into such payload. Elena Reshetova recently wrote a test case for it [2, 3]. Moreover, eBPF comes with 64 bit constants, which can leave some more room for payloads. Note that for all this, additional bugs in the kernel are still required to make the jump (and of course to guess right, to not jump into a trap) and naturally the JIT must be enabled, which is disabled by default. For helping mitigation, the general idea is to provide an option bpf_jit_harden that admins can tweak along with bpf_jit_enable, so that for cases where JIT should be enabled for performance reasons, the generated image can be further hardened with blinding constants for unprivileged users (bpf_jit_harden == 1), with trading off performance for these, but not for privileged ones. We also added the option of blinding for all users (bpf_jit_harden == 2), which is quite helpful for testing f.e. with test_bpf.ko. There are no further e.g. hardening levels of bpf_jit_harden switch intended, rationale is to have it dead simple to use as on/off. Since this functionality would need to be duplicated over and over for JIT compilers to use, which are already complex enough, we provide a generic eBPF byte-code level based blinding implementation, which is then just transparently JITed. JIT compilers need to make only a few changes to integrate this facility and can be migrated one by one. This option is for eBPF JITs and will be used in x86, arm64, s390 without too much effort, and soon ppc64 JITs, thus that native eBPF can be blinded as well as cBPF to eBPF migrations, so that both can be covered with a single implementation. The rule for JITs is that bpf_jit_blind_constants() must be called from bpf_int_jit_compile(), and in case blinding is disabled, we follow normally with JITing the passed program. In case blinding is enabled and we fail during the process of blinding itself, we must return with the interpreter.
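A minimal sketch of that contract, roughly as the arch patches later in the series (x86, arm64, s390) implement it; the arch-specific code generation is elided and only the blinding plumbing is shown, so treat it as an illustration rather than a drop-in implementation:

	struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
	{
		struct bpf_prog *tmp, *orig_prog = prog;
		bool tmp_blinded = false;

		if (!bpf_jit_enable)
			return orig_prog;

		tmp = bpf_jit_blind_constants(prog);
		/* Blinding was requested but failed: fall back to the interpreter. */
		if (IS_ERR(tmp))
			return orig_prog;
		if (tmp != prog) {
			tmp_blinded = true;
			prog = tmp;
		}

		/* ... arch-specific JITing of 'prog' goes here; on any JIT
		 * failure, reset prog = orig_prog so the interpreter runs
		 * the original, non-blinded image ...
		 */

		if (tmp_blinded)
			bpf_jit_prog_release_other(prog, prog == orig_prog ?
						   tmp : orig_prog);
		return prog;
	}

On success the blinded clone is kept and the original program released, on failure the roles are swapped; bpf_jit_prog_release_other() repoints the shared aux to the survivor and frees the other copy.
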
Similarly, in case the JITing process after the blinding failed, we return normally to the interpreter with the non-blinded code. Meaning, interpreter doesn't change in any way and operates on eBPF code as usual. For doing this pre-JIT blinding step, we need to make use of a helper/auxiliary register, here BPF_REG_AX. This is strictly internal to the JIT and not in any way part of the eBPF architecture. Just like in the same way as JITs internally make use of some helper registers when emitting code, only that here the helper register is one abstraction level higher in eBPF bytecode, but nevertheless in JIT phase. That helper register is needed since f.e. manually written program can issue loads to all registers of eBPF architecture. The core concept with the additional register is: blind out all 32 and 64 bit constants by converting BPF_K based instructions into a small sequence from K_VAL into ((RND ^ K_VAL) ^ RND). Therefore, this is transformed into: BPF_REG_AX := (RND ^ K_VAL), BPF_REG_AX ^= RND, and REG BPF_REG_AX, so actual operation on the target register is translated from BPF_K into BPF_X one that is operating on BPF_REG_AX's content. During rewriting phase when blinding, RND is newly generated via prandom_u32() for each processed instruction. 64 bit loads are split into two 32 bit loads to make translation and patching not too complex. Only basic thing required by JITs is to call the helper bpf_jit_blind_constants()/bpf_jit_prog_release_other() pair, and to map BPF_REG_AX into an unused register. Small bpf_jit_disasm extract from [2] when applied to x86 JIT: echo 0 > /proc/sys/net/core/bpf_jit_harden ffffffffa034f5e9 + : [...] 39: mov $0xa8909090,%eax 3e: mov $0xa8909090,%eax 43: mov $0xa8ff3148,%eax 48: mov $0xa89081b4,%eax 4d: mov $0xa8900bb0,%eax 52: mov $0xa810e0c1,%eax 57: mov $0xa8908eb4,%eax 5c: mov $0xa89020b0,%eax [...] echo 1 > /proc/sys/net/core/bpf_jit_harden ffffffffa034f1e5 + : [...] 39: mov $0xe1192563,%r10d 3f: xor $0x4989b5f3,%r10d 46: mov %r10d,%eax 49: mov $0xb8296d93,%r10d 4f: xor $0x10b9fd03,%r10d 56: mov %r10d,%eax 59: mov $0x8c381146,%r10d 5f: xor $0x24c7200e,%r10d 66: mov %r10d,%eax 69: mov $0xeb2a830e,%r10d 6f: xor $0x43ba02ba,%r10d 76: mov %r10d,%eax 79: mov $0xd9730af,%r10d 7f: xor $0xa5073b1f,%r10d 86: mov %r10d,%eax 89: mov $0x9a45662b,%r10d 8f: xor $0x325586ea,%r10d 96: mov %r10d,%eax [...] As can be seen, original constants that carry payload are hidden when enabled, actual operations are transformed from constant-based to register-based ones, making jumps into constants ineffective. Above extract/example uses single BPF load instruction over and over, but of course all instructions with constants are blinded. Performance wise, JIT with blinding performs a bit slower than just JIT and faster than interpreter case. This is expected, since we still get all the performance benefits from JITing and in normal use-cases not every single instruction needs to be blinded. Summing up all 296 test cases averaged over multiple runs from test_bpf.ko suite, interpreter was 55% slower than JIT only and JIT with blinding was 8% slower than JIT only. Since there are also some extremes in the test suite, I expect for ordinary workloads that the performance for the JIT with blinding case is even closer to JIT only case, f.e. nmap test case from suite has averaged timings in ns 29 (JIT), 35 (+ blinding), and 151 (interpreter). 
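To make the rewrite concrete, here is what a single BPF_K ALU instruction looks like before and after blinding, written with the kernel's eBPF instruction macros; the register, constant and RND values are made up for illustration:

	/* Original: r2 += 0x11223344 (BPF_K, i.e. immediate form) */
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 0x11223344)

	/* After blinding, with RND = 0x5aa5f00d freshly drawn from
	 * prandom_u32() for this one instruction:
	 */
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0x5aa5f00d ^ 0x11223344), /* AX = RND ^ K    */
	BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, 0x5aa5f00d),              /* AX ^= RND -> K  */
	BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_AX)                /* r2 += AX (BPF_X) */

Neither RND nor RND ^ K equals the original constant, so the attacker-chosen value never appears verbatim in the JITed image, which is what the hardened disassembly above shows.
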
BPF test suite, seccomp test suite, eBPF sample code and various bigger networking eBPF programs have been tested with this and were running fine. For testing purposes, I also adapted interpreter and redirected blinded eBPF image to interpreter and also here all tests pass. [1] http://mainisusuallyafunction.blogspot.com/2012/11/attacking-hardened-linux-systems-with.html [2] https://github.com/01org/jit-spray-poc-for-ksp/ [3] http://www.openwall.com/lists/kernel-hardening/2016/05/03/5 Signed-off-by: Daniel Borkmann Reviewed-by: Elena Reshetova Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 11 ++ include/linux/filter.h | 42 ++++++++ kernel/bpf/core.c | 203 +++++++++++++++++++++++++++++++++++ net/Kconfig | 7 +- net/core/sysctl_net_core.c | 9 ++ 5 files changed, 270 insertions(+), 2 deletions(-) diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 809ab6efcc74..f0480f7ea740 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -43,6 +43,17 @@ Values : 1 - enable the JIT 2 - enable the JIT and ask the compiler to emit traces on kernel log. +bpf_jit_harden +-------------- + +This enables hardening for the Berkeley Packet Filter Just in Time compiler. +Supported are eBPF JIT backends. Enabling hardening trades off performance, +but can mitigate JIT spraying. +Values : + 0 - disable JIT hardening (default value) + 1 - enable JIT hardening for unprivileged users only + 2 - enable JIT hardening for all users + dev_weight -------------- diff --git a/include/linux/filter.h b/include/linux/filter.h index 891852cf7716..6fc31ef1da2d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -13,6 +13,8 @@ #include #include #include +#include + #include #include @@ -42,6 +44,15 @@ struct bpf_prog_aux; #define BPF_REG_X BPF_REG_7 #define BPF_REG_TMP BPF_REG_8 +/* Kernel hidden auxiliary/helper register for hardening step. + * Only used by eBPF JITs. It's nothing more than a temporary + * register that JITs use internally, only that here it's part + * of eBPF instructions that have been rewritten for blinding + * constants. See JIT pre-step in bpf_jit_blind_constants(). + */ +#define BPF_REG_AX MAX_BPF_REG +#define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) + /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 @@ -501,6 +512,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; +extern int bpf_jit_harden; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); @@ -513,6 +525,9 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr); void bpf_jit_compile(struct bpf_prog *fp); void bpf_jit_free(struct bpf_prog *fp); +struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); +void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); + static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { @@ -523,6 +538,33 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, 16, 1, image, proglen, false); } + +static inline bool bpf_jit_is_ebpf(void) +{ +# ifdef CONFIG_HAVE_EBPF_JIT + return true; +# else + return false; +# endif +} + +static inline bool bpf_jit_blinding_enabled(void) +{ + /* These are the prerequisites, should someone ever have the + * idea to call blinding outside of them, we make sure to + * bail out. 
+ */ + if (!bpf_jit_is_ebpf()) + return false; + if (!bpf_jit_enable) + return false; + if (!bpf_jit_harden) + return false; + if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN)) + return false; + + return true; +} #else static inline void bpf_jit_compile(struct bpf_prog *fp) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 70f0821aca47..f1e8a0def99b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -243,6 +243,209 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) { module_memfree(hdr); } + +int bpf_jit_harden __read_mostly; + +static int bpf_jit_blind_insn(const struct bpf_insn *from, + const struct bpf_insn *aux, + struct bpf_insn *to_buff) +{ + struct bpf_insn *to = to_buff; + u32 imm_rnd = prandom_u32(); + s16 off; + + BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + + if (from->imm == 0 && + (from->code == (BPF_ALU | BPF_MOV | BPF_K) || + from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { + *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); + goto out; + } + + switch (from->code) { + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MOV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX); + break; + + case BPF_ALU64 | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_K: + case BPF_ALU64 | BPF_AND | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_MUL | BPF_K: + case BPF_ALU64 | BPF_MOV | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_K: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX); + break; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + /* Accommodate for extra offset in case of a backjump. 
*/ + off = from->off; + if (off < 0) + off -= 2; + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); + break; + + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); + break; + + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); + *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); + break; + + case BPF_LD | BPF_IMM | BPF_DW: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); + break; + case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); + break; + + case BPF_ST | BPF_MEM | BPF_DW: + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); + break; + } +out: + return to - to_buff; +} + +static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + kmemcheck_annotate_bitfield(fp, meta); + + /* aux->prog still points to the fp_other one, so + * when promoting the clone to the real program, + * this still needs to be adapted. + */ + memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); + } + + return fp; +} + +static void bpf_prog_clone_free(struct bpf_prog *fp) +{ + /* aux was stolen by the other clone, so we cannot free + * it from this path! It will be freed eventually by the + * other program on release. + * + * At this point, we don't need a deferred release since + * clone is guaranteed to not be locked. + */ + fp->aux = NULL; + __bpf_prog_free(fp); +} + +void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) +{ + /* We have to repoint aux->prog to self, as we don't + * know whether fp here is the clone or the original. + */ + fp->aux->prog = fp; + bpf_prog_clone_free(fp_other); +} + +struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) +{ + struct bpf_insn insn_buff[16], aux[2]; + struct bpf_prog *clone, *tmp; + int insn_delta, insn_cnt; + struct bpf_insn *insn; + int i, rewritten; + + if (!bpf_jit_blinding_enabled()) + return prog; + + clone = bpf_prog_clone_create(prog, GFP_USER); + if (!clone) + return ERR_PTR(-ENOMEM); + + insn_cnt = clone->len; + insn = clone->insnsi; + + for (i = 0; i < insn_cnt; i++, insn++) { + /* We temporarily need to hold the original ld64 insn + * so that we can still access the first part in the + * second blinding run. 
+ */ + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && + insn[1].code == 0) + memcpy(aux, insn, sizeof(aux)); + + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + if (!rewritten) + continue; + + tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); + if (!tmp) { + /* Patching may have repointed aux->prog during + * realloc from the original one, so we need to + * fix it up here on error. + */ + bpf_jit_prog_release_other(prog, clone); + return ERR_PTR(-ENOMEM); + } + + clone = tmp; + insn_delta = rewritten - 1; + + /* Walk new program and skip insns we just inserted. */ + insn = clone->insnsi + i + insn_delta; + insn_cnt += insn_delta; + i += insn_delta; + } + + return clone; +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, diff --git a/net/Kconfig b/net/Kconfig index f7148f24f114..ff40562a782c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -295,8 +295,11 @@ config BPF_JIT Berkeley Packet Filter filtering capabilities are normally handled by an interpreter. This option allows kernel to generate a native code when filter is loaded in memory. This should speedup - packet sniffing (libpcap/tcpdump). Note : Admin should enable - this feature changing /proc/sys/net/core/bpf_jit_enable + packet sniffing (libpcap/tcpdump). + + Note, admin should enable this feature changing: + /proc/sys/net/core/bpf_jit_enable + /proc/sys/net/core/bpf_jit_harden (optional) config NET_FLOW_LIMIT bool diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a6beb7b6ae55..0df2aa652530 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -294,6 +294,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +# ifdef CONFIG_HAVE_EBPF_JIT + { + .procname = "bpf_jit_harden", + .data = &bpf_jit_harden, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec, + }, +# endif #endif { .procname = "netdev_tstamp_prequeue", From 959a7579160349d222cc5da30db3b138139b6fbc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:33 +0200 Subject: [PATCH 08/10] bpf, x86: add support for constant blinding This patch adds recently added constant blinding helpers into the x86 eBPF JIT. In the bpf_int_jit_compile() path, requirements are to utilize bpf_jit_blind_constants()/bpf_jit_prog_release_other() pair for rewriting the program into a blinded one, and to map the BPF_REG_AX register to a CPU register. The mapping of BPF_REG_AX is at non-callee saved register r10, and thus shared with cached skb->data used for ld_abs/ind and not in every program type needed. When blinding is not used, there's zero additional overhead in the generated image. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 66 +++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6b2d23ea3590..fe04a04dab8e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -110,11 +110,16 @@ static void bpf_flush_icache(void *start, void *end) ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) /* pick a register outside of BPF range for JIT internal work */ -#define AUX_REG (MAX_BPF_REG + 1) +#define AUX_REG (MAX_BPF_JIT_REG + 1) -/* the following table maps BPF registers to x64 registers. 
- * x64 register r12 is unused, since if used as base address register - * in load/store instructions, it always needs an extra byte of encoding +/* The following table maps BPF registers to x64 registers. + * + * x64 register r12 is unused, since if used as base address + * register in load/store instructions, it always needs an + * extra byte of encoding and is callee saved. + * + * r9 caches skb->len - skb->data_len + * r10 caches skb->data, and used for blinding (if enabled) */ static const int reg2hex[] = { [BPF_REG_0] = 0, /* rax */ @@ -128,6 +133,7 @@ static const int reg2hex[] = { [BPF_REG_8] = 6, /* r14 callee saved */ [BPF_REG_9] = 7, /* r15 callee saved */ [BPF_REG_FP] = 5, /* rbp readonly */ + [BPF_REG_AX] = 2, /* r10 temp register */ [AUX_REG] = 3, /* r11 temp register */ }; @@ -141,7 +147,8 @@ static bool is_ereg(u32 reg) BIT(AUX_REG) | BIT(BPF_REG_7) | BIT(BPF_REG_8) | - BIT(BPF_REG_9)); + BIT(BPF_REG_9) | + BIT(BPF_REG_AX)); } /* add modifiers if 'reg' maps to x64 registers r8..r15 */ @@ -182,6 +189,7 @@ static void jit_fill_hole(void *area, unsigned int size) struct jit_context { int cleanup_addr; /* epilogue code offset */ bool seen_ld_abs; + bool seen_ax_reg; }; /* maximum number of bytes emitted while JITing one eBPF insn */ @@ -345,6 +353,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, struct bpf_insn *insn = bpf_prog->insnsi; int insn_cnt = bpf_prog->len; bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); + bool seen_ax_reg = ctx->seen_ax_reg | (oldproglen == 0); bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0; @@ -367,6 +376,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int ilen; u8 *func; + if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX) + ctx->seen_ax_reg = seen_ax_reg = true; + switch (insn->code) { /* ALU */ case BPF_ALU | BPF_ADD | BPF_X: @@ -1002,6 +1014,10 @@ common_load: * sk_load_* helpers also use %r10 and %r9d. * See bpf_jit.S */ + if (seen_ax_reg) + /* r10 = skb->data, mov %r10, off32(%rbx) */ + EMIT3_off32(0x4c, 0x8b, 0x93, + offsetof(struct sk_buff, data)); EMIT1_off32(0xE8, jmp_offset); /* call */ break; @@ -1076,19 +1092,34 @@ void bpf_jit_compile(struct bpf_prog *prog) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; + struct bpf_prog *tmp, *orig_prog = prog; int proglen, oldproglen = 0; struct jit_context ctx = {}; + bool tmp_blinded = false; u8 *image = NULL; int *addrs; int pass; int i; if (!bpf_jit_enable) - return prog; + return orig_prog; + + tmp = bpf_jit_blind_constants(prog); + /* If blinding was requested and we failed during blinding, + * we must fall back to the interpreter. 
+ */ + if (IS_ERR(tmp)) + return orig_prog; + if (tmp != prog) { + tmp_blinded = true; + prog = tmp; + } addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL); - if (!addrs) - return prog; + if (!addrs) { + prog = orig_prog; + goto out; + } /* Before first pass, make a rough estimation of addrs[] * each bpf instruction is translated to less than 64 bytes @@ -1110,21 +1141,25 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) image = NULL; if (header) bpf_jit_binary_free(header); - goto out; + prog = orig_prog; + goto out_addrs; } if (image) { if (proglen != oldproglen) { pr_err("bpf_jit: proglen=%d != oldproglen=%d\n", proglen, oldproglen); - goto out; + prog = orig_prog; + goto out_addrs; } break; } if (proglen == oldproglen) { header = bpf_jit_binary_alloc(proglen, &image, 1, jit_fill_hole); - if (!header) - goto out; + if (!header) { + prog = orig_prog; + goto out_addrs; + } } oldproglen = proglen; } @@ -1138,8 +1173,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog->bpf_func = (void *)image; prog->jited = 1; } -out: + +out_addrs: kfree(addrs); +out: + if (tmp_blinded) + bpf_jit_prog_release_other(prog, prog == orig_prog ? + tmp : orig_prog); return prog; } From 26eb042ee4c7845aa395c41c4e125c240b82b984 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:34 +0200 Subject: [PATCH 09/10] bpf, arm64: add support for constant blinding This patch adds recently added constant blinding helpers into the arm64 eBPF JIT. In the bpf_int_jit_compile() path, requirements are to utilize bpf_jit_blind_constants()/bpf_jit_prog_release_other() pair for rewriting the program into a blinded one, and to map the BPF_REG_AX register to a CPU register. The mapping is on x9. Signed-off-by: Daniel Borkmann Acked-by: Zi Shen Lim Acked-by: Yang Shi Tested-by: Yang Shi Signed-off-by: David S. Miller --- arch/arm64/net/bpf_jit_comp.c | 52 +++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index dd428807cb30..d0d51903c7e0 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -31,8 +31,8 @@ int bpf_jit_enable __read_mostly; -#define TMP_REG_1 (MAX_BPF_REG + 0) -#define TMP_REG_2 (MAX_BPF_REG + 1) +#define TMP_REG_1 (MAX_BPF_JIT_REG + 0) +#define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* Map BPF registers to A64 registers */ static const int bpf2a64[] = { @@ -54,6 +54,8 @@ static const int bpf2a64[] = { /* temporary register for internal BPF JIT */ [TMP_REG_1] = A64_R(23), [TMP_REG_2] = A64_R(24), + /* temporary register for blinding constants */ + [BPF_REG_AX] = A64_R(9), }; struct jit_ctx { @@ -764,26 +766,43 @@ void bpf_jit_compile(struct bpf_prog *prog) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { + struct bpf_prog *tmp, *orig_prog = prog; struct bpf_binary_header *header; + bool tmp_blinded = false; struct jit_ctx ctx; int image_size; u8 *image_ptr; if (!bpf_jit_enable) - return prog; + return orig_prog; + + tmp = bpf_jit_blind_constants(prog); + /* If blinding was requested and we failed during blinding, + * we must fall back to the interpreter. + */ + if (IS_ERR(tmp)) + return orig_prog; + if (tmp != prog) { + tmp_blinded = true; + prog = tmp; + } memset(&ctx, 0, sizeof(ctx)); ctx.prog = prog; ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL); - if (ctx.offset == NULL) - return prog; + if (ctx.offset == NULL) { + prog = orig_prog; + goto out; + } /* 1. Initial fake pass to compute ctx->idx. 
*/ /* Fake pass to fill in ctx->offset and ctx->tmp_used. */ - if (build_body(&ctx)) - goto out; + if (build_body(&ctx)) { + prog = orig_prog; + goto out_off; + } build_prologue(&ctx); @@ -794,8 +813,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) image_size = sizeof(u32) * ctx.idx; header = bpf_jit_binary_alloc(image_size, &image_ptr, sizeof(u32), jit_fill_hole); - if (header == NULL) - goto out; + if (header == NULL) { + prog = orig_prog; + goto out_off; + } /* 2. Now, the actual pass. */ @@ -806,7 +827,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) if (build_body(&ctx)) { bpf_jit_binary_free(header); - goto out; + prog = orig_prog; + goto out_off; } build_epilogue(&ctx); @@ -814,7 +836,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* 3. Extra pass to validate JITed code. */ if (validate_code(&ctx)) { bpf_jit_binary_free(header); - goto out; + prog = orig_prog; + goto out_off; } /* And we're done. */ @@ -826,8 +849,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)ctx.image; prog->jited = 1; -out: + +out_off: kfree(ctx.offset); +out: + if (tmp_blinded) + bpf_jit_prog_release_other(prog, prog == orig_prog ? + tmp : orig_prog); return prog; } From d93a47f735f3455a896e46b18d0ac26fa19639e6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:35 +0200 Subject: [PATCH 10/10] bpf, s390: add support for constant blinding This patch adds recently added constant blinding helpers into the s390 eBPF JIT. In the bpf_int_jit_compile() path, requirements are to utilize bpf_jit_blind_constants()/bpf_jit_prog_release_other() pair for rewriting the program into a blinded one, and to map the BPF_REG_AX register to a CPU register. The mapping of BPF_REG_AX is at r12 and similarly like in x86 case performs reloading when ld_abs/ind is used. When blinding is not used, there's no additional overhead in the generated image. When BPF_REG_AX is used, we don't need to emit skb->data reload when helper function changed skb->data, as this will be reloaded later on anyway from stack on ld_abs/ind, where skb->data is needed. s390 allows for this w/o much additional complexity unlike f.e. x86. Signed-off-by: Daniel Borkmann Signed-off-by: Michael Holzheu Signed-off-by: David S. 
Miller --- arch/s390/net/bpf_jit_comp.c | 73 +++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index fcf301a889e7..9133b0ec000b 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -54,16 +54,17 @@ struct bpf_jit { #define SEEN_FUNC 16 /* calls C functions */ #define SEEN_TAIL_CALL 32 /* code uses tail calls */ #define SEEN_SKB_CHANGE 64 /* code changes skb data */ +#define SEEN_REG_AX 128 /* code uses constant blinding */ #define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB) /* * s390 registers */ -#define REG_W0 (__MAX_BPF_REG+0) /* Work register 1 (even) */ -#define REG_W1 (__MAX_BPF_REG+1) /* Work register 2 (odd) */ -#define REG_SKB_DATA (__MAX_BPF_REG+2) /* SKB data register */ -#define REG_L (__MAX_BPF_REG+3) /* Literal pool register */ -#define REG_15 (__MAX_BPF_REG+4) /* Register 15 */ +#define REG_W0 (MAX_BPF_JIT_REG + 0) /* Work register 1 (even) */ +#define REG_W1 (MAX_BPF_JIT_REG + 1) /* Work register 2 (odd) */ +#define REG_SKB_DATA (MAX_BPF_JIT_REG + 2) /* SKB data register */ +#define REG_L (MAX_BPF_JIT_REG + 3) /* Literal pool register */ +#define REG_15 (MAX_BPF_JIT_REG + 4) /* Register 15 */ #define REG_0 REG_W0 /* Register 0 */ #define REG_1 REG_W1 /* Register 1 */ #define REG_2 BPF_REG_1 /* Register 2 */ @@ -88,6 +89,8 @@ static const int reg2hex[] = { [BPF_REG_9] = 10, /* BPF stack pointer */ [BPF_REG_FP] = 13, + /* Register for blinding (shared with REG_SKB_DATA) */ + [BPF_REG_AX] = 12, /* SKB data pointer */ [REG_SKB_DATA] = 12, /* Work registers for s390x backend */ @@ -385,7 +388,7 @@ static void save_restore_regs(struct bpf_jit *jit, int op) /* * For SKB access %b1 contains the SKB pointer. For "bpf_jit.S" * we store the SKB header length on the stack and the SKB data - * pointer in REG_SKB_DATA. + * pointer in REG_SKB_DATA if BPF_REG_AX is not used. 
*/ static void emit_load_skb_data_hlen(struct bpf_jit *jit) { @@ -397,9 +400,10 @@ static void emit_load_skb_data_hlen(struct bpf_jit *jit) offsetof(struct sk_buff, data_len)); /* stg %w1,ST_OFF_HLEN(%r0,%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, STK_OFF_HLEN); - /* lg %skb_data,data_off(%b1) */ - EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, - BPF_REG_1, offsetof(struct sk_buff, data)); + if (!(jit->seen & SEEN_REG_AX)) + /* lg %skb_data,data_off(%b1) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, + BPF_REG_1, offsetof(struct sk_buff, data)); } /* @@ -487,6 +491,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i s32 imm = insn->imm; s16 off = insn->off; + if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX) + jit->seen |= SEEN_REG_AX; switch (insn->code) { /* * BPF_MOV @@ -1188,7 +1194,7 @@ call_fn: /* * Implicit input: * BPF_REG_6 (R7) : skb pointer - * REG_SKB_DATA (R12): skb data pointer + * REG_SKB_DATA (R12): skb data pointer (if no BPF_REG_AX) * * Calculated input: * BPF_REG_2 (R3) : offset of byte(s) to fetch in skb @@ -1209,6 +1215,11 @@ call_fn: /* agfr %b2,%src (%src is s32 here) */ EMIT4(0xb9180000, BPF_REG_2, src_reg); + /* Reload REG_SKB_DATA if BPF_REG_AX is used */ + if (jit->seen & SEEN_REG_AX) + /* lg %skb_data,data_off(%b6) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, + BPF_REG_6, offsetof(struct sk_buff, data)); /* basr %b5,%w1 (%b5 is call saved) */ EMIT2(0x0d00, BPF_REG_5, REG_W1); @@ -1264,36 +1275,60 @@ void bpf_jit_compile(struct bpf_prog *fp) */ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { + struct bpf_prog *tmp, *orig_fp = fp; struct bpf_binary_header *header; + bool tmp_blinded = false; struct bpf_jit jit; int pass; if (!bpf_jit_enable) - return fp; + return orig_fp; + + tmp = bpf_jit_blind_constants(fp); + /* + * If blinding was requested and we failed during blinding, + * we must fall back to the interpreter. + */ + if (IS_ERR(tmp)) + return orig_fp; + if (tmp != fp) { + tmp_blinded = true; + fp = tmp; + } memset(&jit, 0, sizeof(jit)); jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); - if (jit.addrs == NULL) - return fp; + if (jit.addrs == NULL) { + fp = orig_fp; + goto out; + } /* * Three initial passes: * - 1/2: Determine clobbered registers * - 3: Calculate program size and addrs arrray */ for (pass = 1; pass <= 3; pass++) { - if (bpf_jit_prog(&jit, fp)) + if (bpf_jit_prog(&jit, fp)) { + fp = orig_fp; goto free_addrs; + } } /* * Final pass: Allocate and generate program */ - if (jit.size >= BPF_SIZE_MAX) + if (jit.size >= BPF_SIZE_MAX) { + fp = orig_fp; goto free_addrs; + } header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 2, jit_fill_hole); - if (!header) + if (!header) { + fp = orig_fp; goto free_addrs; - if (bpf_jit_prog(&jit, fp)) + } + if (bpf_jit_prog(&jit, fp)) { + fp = orig_fp; goto free_addrs; + } if (bpf_jit_enable > 1) { bpf_jit_dump(fp->len, jit.size, pass, jit.prg_buf); if (jit.prg_buf) @@ -1306,6 +1341,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) } free_addrs: kfree(jit.addrs); +out: + if (tmp_blinded) + bpf_jit_prog_release_other(fp, fp == orig_fp ? + tmp : orig_fp); return fp; }
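As a final illustration of the blinding scheme from patch 7: 64 bit immediate loads are split into two 32 bit halves, matching the BPF_LD | BPF_IMM | BPF_DW case and its paired 'case 0' in bpf_jit_blind_insn(). The expansion below is illustrative only; the register, the constant and the RND1/RND2 placeholders (each standing for a fresh prandom_u32() value) are made up:

	/* Original: r3 = 0xdeadbeefcafebabe, encoded as a two-insn
	 * BPF_LD | BPF_IMM | BPF_DW pair (low word in the first insn's
	 * imm, high word in the second insn's imm).
	 */

	/* First half builds the upper 32 bits in the dst register: */
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, RND1 ^ 0xdeadbeef),
	BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, RND1),
	BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32),
	BPF_ALU64_REG(BPF_MOV, BPF_REG_3, BPF_REG_AX),

	/* Second half ('case 0' part of the pair) ORs in the lower 32 bits: */
	BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, RND2 ^ 0xcafebabe),
	BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, RND2),
	BPF_ALU64_REG(BPF_OR, BPF_REG_3, BPF_REG_AX)

The constant halves only ever appear in the image XORed with RND1/RND2; BPF_REG_AX, which the per-arch patches map to a scratch register (r10 on x86, x9 on arm64, r12 on s390), reassembles them at run time.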