From 8f4a4160c618434e76f7d68673a6be50fac0549a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Sep 2019 13:43:24 +0200 Subject: [PATCH 01/19] x86/alternatives: Update int3_emulate_push() comment Update the comment now that we've merged x86_32 support. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132457.588386013@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/text-patching.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 23c626a742e8..ea91049058d8 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -85,6 +85,9 @@ static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of * this gap. See the idtentry macro's create_gap option. + * + * Similarly entry_32.S will have a gap on the stack for (any) hardware + * exception and pt_regs; see FIXUP_FRAME. */ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; From 18cbc8bed0c70795d2064217c89894e28eafdf04 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 26 Aug 2019 13:38:58 +0200 Subject: [PATCH 02/19] x86/alternatives, jump_label: Provide better text_poke() batching interface Adding another text_poke_bp_batch() user made me realize the interface is all sorts of wrong. The text poke vector should be internal to the implementation. This then results in a trivial interface: text_poke_queue() - which has the 'normal' text_poke_bp() interface text_poke_finish() - which takes no arguments and flushes any pending text_poke()s. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Reviewed-by: Daniel Bristot de Oliveira Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: H. 
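
As a rough illustration of the resulting interface (this sketch is not part of the patch; example_patch_sites(), addrs, nr, insn and len are made up, only text_poke_queue() and text_poke_finish() come from the change), a caller queues any number of pokes, ideally in ascending address order so the internal vector stays sorted, and flushes them once, all under text_mutex:

    static void example_patch_sites(void **addrs, int nr,
                                    const void *insn, size_t len)
    {
            int i;

            mutex_lock(&text_mutex);
            for (i = 0; i < nr; i++)
                    text_poke_queue(addrs[i], insn, len, NULL);
            text_poke_finish();
            mutex_unlock(&text_mutex);
    }

Passing NULL for the emulate argument keeps the existing text_poke_bp() convention of emulating the newly written instruction itself.
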
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Steven Rostedt Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132457.646280715@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/text-patching.h | 15 ++-- arch/x86/kernel/alternative.c | 64 +++++++++++++++-- arch/x86/kernel/jump_label.c | 101 ++++++++++----------------- 3 files changed, 97 insertions(+), 83 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index ea91049058d8..3bcd26623a1f 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -25,14 +25,6 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, */ #define POKE_MAX_OPCODE_SIZE 5 -struct text_poke_loc { - void *addr; - int len; - s32 rel32; - u8 opcode; - const u8 text[POKE_MAX_OPCODE_SIZE]; -}; - extern void text_poke_early(void *addr, const void *opcode, size_t len); /* @@ -53,9 +45,10 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries); -extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate); + +extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); +extern void text_poke_finish(void); + extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9ec463fe96f2..42e7f0af88da 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -936,6 +936,14 @@ static void do_sync_core(void *info) sync_core(); } +struct text_poke_loc { + void *addr; + int len; + s32 rel32; + u8 opcode; + const u8 text[POKE_MAX_OPCODE_SIZE]; +}; + static struct bp_patching_desc { struct text_poke_loc *vec; int nr_entries; @@ -1023,6 +1031,10 @@ int poke_int3_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(poke_int3_handler); +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) +static struct text_poke_loc tp_vec[TP_VEC_MAX]; +static int tp_vec_nr; + /** * text_poke_bp_batch() -- update instructions on live kernel on SMP * @tp: vector of instructions to patch @@ -1044,7 +1056,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); * replacing opcode * - sync cores */ -void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; @@ -1118,11 +1130,7 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, { struct insn insn; - if (!opcode) - opcode = (void *)tp->text; - else - memcpy((void *)tp->text, opcode, len); - + memcpy((void *)tp->text, opcode, len); if (!emulate) emulate = opcode; @@ -1167,6 +1175,50 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, } } +/* + * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * early if needed. 
+ */ +static bool tp_order_fail(void *addr) +{ + struct text_poke_loc *tp; + + if (!tp_vec_nr) + return false; + + if (!addr) /* force */ + return true; + + tp = &tp_vec[tp_vec_nr - 1]; + if ((unsigned long)tp->addr > (unsigned long)addr) + return true; + + return false; +} + +static void text_poke_flush(void *addr) +{ + if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { + text_poke_bp_batch(tp_vec, tp_vec_nr); + tp_vec_nr = 0; + } +} + +void text_poke_finish(void) +{ + text_poke_flush(NULL); +} + +void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +{ + struct text_poke_loc *tp; + + text_poke_flush(addr); + + tp = &tp_vec[tp_vec_nr++]; + text_poke_loc_init(tp, addr, opcode, len, emulate); +} + /** * text_poke_bp() -- update instructions on live kernel on SMP * @addr: address to patch diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index c1a8b9e71408..cf8c847c7b5d 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -35,18 +35,19 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __jump_label_set_jump_code(struct jump_entry *entry, - enum jump_label_type type, - union jump_code_union *code, - int init) +static const void * +__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init) { + static union jump_code_union code; /* relies on text_mutex */ const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; const void *expect; int line; - code->jump = 0xe9; - code->offset = jump_entry_target(entry) - + lockdep_assert_held(&text_mutex); + + code.jump = JMP32_INSN_OPCODE; + code.offset = jump_entry_target(entry) - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); if (init) { @@ -54,23 +55,23 @@ static void __jump_label_set_jump_code(struct jump_entry *entry, } else if (type == JUMP_LABEL_JMP) { expect = ideal_nop; line = __LINE__; } else { - expect = code->code; line = __LINE__; + expect = code.code; line = __LINE__; } if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE)) bug_at((void *)jump_entry_code(entry), line); if (type == JUMP_LABEL_NOP) - memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE); + memcpy(&code, ideal_nop, JUMP_LABEL_NOP_SIZE); + + return &code; } -static void __ref __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void inline __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) { - union jump_code_union code; - - __jump_label_set_jump_code(entry, type, &code, init); + const void *opcode = __jump_label_set_jump_code(entry, type, init); /* * As long as only a single processor is running and the code is still @@ -84,31 +85,33 @@ static void __ref __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { - text_poke_early((void *)jump_entry_code(entry), &code, + text_poke_early((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE); return; } - text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL); + text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL); +} + +static void __ref jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) +{ + mutex_lock(&text_mutex); + __jump_label_transform(entry, type, init); + mutex_unlock(&text_mutex); } void arch_jump_label_transform(struct jump_entry *entry, enum 
jump_label_type type) { - mutex_lock(&text_mutex); - __jump_label_transform(entry, type, 0); - mutex_unlock(&text_mutex); + jump_label_transform(entry, type, 0); } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) -static struct text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; - bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { - struct text_poke_loc *tp; - void *entry_code; + const void *opcode; if (system_state == SYSTEM_BOOTING) { /* @@ -118,53 +121,19 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, return true; } - /* - * No more space in the vector, tell upper layer to apply - * the queue before continuing. - */ - if (tp_vec_nr == TP_VEC_MAX) - return false; - - tp = &tp_vec[tp_vec_nr]; - - entry_code = (void *)jump_entry_code(entry); - - /* - * The INT3 handler will do a bsearch in the queue, so we need entries - * to be sorted. We can survive an unsorted list by rejecting the entry, - * forcing the generic jump_label code to apply the queue. Warning once, - * to raise the attention to the case of an unsorted entry that is - * better not happen, because, in the worst case we will perform in the - * same way as we do without batching - with some more overhead. - */ - if (tp_vec_nr > 0) { - int prev = tp_vec_nr - 1; - struct text_poke_loc *prev_tp = &tp_vec[prev]; - - if (WARN_ON_ONCE(prev_tp->addr > entry_code)) - return false; - } - - __jump_label_set_jump_code(entry, type, - (union jump_code_union *)&tp->text, 0); - - text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL); - - tp_vec_nr++; - + mutex_lock(&text_mutex); + opcode = __jump_label_set_jump_code(entry, type, 0); + text_poke_queue((void *)jump_entry_code(entry), + opcode, JUMP_LABEL_NOP_SIZE, NULL); + mutex_unlock(&text_mutex); return true; } void arch_jump_label_transform_apply(void) { - if (!tp_vec_nr) - return; - mutex_lock(&text_mutex); - text_poke_bp_batch(tp_vec, tp_vec_nr); + text_poke_finish(); mutex_unlock(&text_mutex); - - tp_vec_nr = 0; } static enum { @@ -193,5 +162,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, jlstate = JL_STATE_NO_UPDATE; } if (jlstate == JL_STATE_UPDATE) - __jump_label_transform(entry, type, 1); + jump_label_transform(entry, type, 1); } From 63f62addb88ec4b358cf4574789bc3180c689e9a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Oct 2019 14:50:42 +0200 Subject: [PATCH 03/19] x86/alternatives: Add and use text_gen_insn() helper Provide a simple helper function to create common instruction encodings. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Daniel Bristot de Oliveira Cc: H. 
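
As a hypothetical usage sketch (not part of the patch; example_install_jump() and its parameters are invented, while text_gen_insn(), text_poke_bp() and the JMP32_INSN_* defines are real), the helper computes the displacement as dest - (addr + size) and returns a pointer to a static buffer, so the result is only valid while text_mutex is held:

    static void example_install_jump(void *addr, void *target)
    {
            const void *insn;

            mutex_lock(&text_mutex);
            insn = text_gen_insn(JMP32_INSN_OPCODE, addr, target);
            text_poke_bp(addr, insn, JMP32_INSN_SIZE, NULL);
            mutex_unlock(&text_mutex);
    }

This mirrors how the jump_label and optimized-kprobes call sites converted below use the helper.
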
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Steven Rostedt Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132457.703538332@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/text-patching.h | 2 ++ arch/x86/kernel/alternative.c | 36 ++++++++++++++++++++++++++++ arch/x86/kernel/jump_label.c | 31 +++++++++--------------- arch/x86/kernel/kprobes/opt.c | 7 +----- 4 files changed, 50 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 3bcd26623a1f..95beb85aef65 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -49,6 +49,8 @@ extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_finish(void); +extern void *text_gen_insn(u8 opcode, const void *addr, const void *dest); + extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 42e7f0af88da..714b4a2a6f81 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1237,3 +1237,39 @@ void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulat text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } + +union text_poke_insn { + u8 text[POKE_MAX_OPCODE_SIZE]; + struct { + u8 opcode; + s32 disp; + } __attribute__((packed)); +}; + +void *text_gen_insn(u8 opcode, const void *addr, const void *dest) +{ + static union text_poke_insn insn; /* text_mutex */ + int size = 0; + + lockdep_assert_held(&text_mutex); + + insn.opcode = opcode; + +#define __CASE(insn) \ + case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break + + switch(opcode) { + __CASE(INT3); + __CASE(CALL); + __CASE(JMP32); + __CASE(JMP8); + } + + if (size > 1) { + insn.disp = (long)dest - (long)(addr + size); + if (size == 2) + BUG_ON((insn.disp >> 31) != (insn.disp >> 7)); + } + + return &insn.text; +} diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index cf8c847c7b5d..9c4498ea0b3c 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -16,15 +16,7 @@ #include #include -union jump_code_union { - char code[JUMP_LABEL_NOP_SIZE]; - struct { - char jump; - int offset; - } __attribute__((packed)); -}; - -static void bug_at(unsigned char *ip, int line) +static void bug_at(const void *ip, int line) { /* * The location is not an op that we were expecting. 
@@ -38,33 +30,32 @@ static void bug_at(unsigned char *ip, int line) static const void * __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init) { - static union jump_code_union code; /* relies on text_mutex */ const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - const void *expect; + const void *expect, *code; + const void *addr, *dest; int line; - lockdep_assert_held(&text_mutex); + addr = (void *)jump_entry_code(entry); + dest = (void *)jump_entry_target(entry); - code.jump = JMP32_INSN_OPCODE; - code.offset = jump_entry_target(entry) - - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); + code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); if (init) { expect = default_nop; line = __LINE__; } else if (type == JUMP_LABEL_JMP) { expect = ideal_nop; line = __LINE__; } else { - expect = code.code; line = __LINE__; + expect = code; line = __LINE__; } - if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE)) - bug_at((void *)jump_entry_code(entry), line); + if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) + bug_at(addr, line); if (type == JUMP_LABEL_NOP) - memcpy(&code, ideal_nop, JUMP_LABEL_NOP_SIZE); + code = ideal_nop; - return &code; + return code; } static void inline __jump_label_transform(struct jump_entry *entry, diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 8900329c28a7..9b01ee7b3923 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -447,18 +447,13 @@ void arch_optimize_kprobes(struct list_head *oplist) void arch_unoptimize_kprobe(struct optimized_kprobe *op) { u8 insn_buff[RELATIVEJUMP_SIZE]; - u8 emulate_buff[RELATIVEJUMP_SIZE]; /* Set int3 to first byte for kprobes */ insn_buff[0] = BREAKPOINT_INSTRUCTION; memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - emulate_buff[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - emulate_buff); + text_gen_insn(JMP32_INSN_OPCODE, op->kp.addr, op->optinsn.insn)); } /* From 768ae4406a5cab7e8702550f2446dbeb377b798d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 26 Aug 2019 14:48:23 +0200 Subject: [PATCH 04/19] x86/ftrace: Use text_poke() Move ftrace over to using the generic x86 text_poke functions; this avoids having a second/different copy of that code around. This also avoids ftrace violating the (new) W^X rule and avoids fragmenting the kernel text page-tables, due to no longer having to toggle them RW. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Daniel Bristot de Oliveira Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132457.761255803@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 2 - arch/x86/kernel/alternative.c | 18 +- arch/x86/kernel/ftrace.c | 681 +++++++--------------------------- arch/x86/kernel/traps.c | 9 - 4 files changed, 143 insertions(+), 567 deletions(-) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index c38a66661576..dbd9b08bf173 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -34,8 +34,6 @@ struct dyn_arch_ftrace { /* No extra data needed for x86 */ }; -int ftrace_int3_handler(struct pt_regs *regs); - #define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 714b4a2a6f81..ce737f1da834 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -949,7 +949,7 @@ static struct bp_patching_desc { int nr_entries; } bp_patching; -static int patch_cmp(const void *key, const void *elt) +static int notrace patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; @@ -961,7 +961,7 @@ static int patch_cmp(const void *key, const void *elt) } NOKPROBE_SYMBOL(patch_cmp); -int poke_int3_handler(struct pt_regs *regs) +int notrace poke_int3_handler(struct pt_regs *regs) { struct text_poke_loc *tp; void *ip; @@ -1209,10 +1209,15 @@ void text_poke_finish(void) text_poke_flush(NULL); } -void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc *tp; + if (unlikely(system_state == SYSTEM_BOOTING)) { + text_poke_early(addr, opcode, len); + return; + } + text_poke_flush(addr); tp = &tp_vec[tp_vec_nr++]; @@ -1230,10 +1235,15 @@ void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emu * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. */ -void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc tp; + if (unlikely(system_state == SYSTEM_BOOTING)) { + text_poke_early(addr, opcode, len); + return; + } + text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 024c3053dbba..3d8adeba2651 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -34,6 +34,8 @@ #ifdef CONFIG_DYNAMIC_FTRACE +static int ftrace_poke_late = 0; + int ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { @@ -43,16 +45,20 @@ int ftrace_arch_code_modify_prepare(void) * ftrace has it set to "read/write". */ mutex_lock(&text_mutex); - set_kernel_text_rw(); - set_all_modules_text_rw(); + ftrace_poke_late = 1; return 0; } int ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) { - set_all_modules_text_ro(); - set_kernel_text_ro(); + /* + * ftrace_make_{call,nop}() may be called during + * module load, and we need to finish the text_poke_queue() + * that they do, here. 
+ */ + text_poke_finish(); + ftrace_poke_late = 0; mutex_unlock(&text_mutex); return 0; } @@ -60,67 +66,34 @@ int ftrace_arch_code_modify_post_process(void) union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; struct { - unsigned char op; + char op; int offset; } __attribute__((packed)); }; -static int ftrace_calc_offset(long ip, long addr) -{ - return (int)(addr - ip); -} - -static unsigned char * -ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr) +static const char *ftrace_text_replace(char op, unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; - calc.op = op; - calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); + calc.op = op; + calc.offset = (int)(addr - (ip + MCOUNT_INSN_SIZE)); return calc.code; } -static unsigned char * -ftrace_call_replace(unsigned long ip, unsigned long addr) -{ - return ftrace_text_replace(0xe8, ip, addr); -} - -static inline int -within(unsigned long addr, unsigned long start, unsigned long end) -{ - return addr >= start && addr < end; -} - -static unsigned long text_ip_addr(unsigned long ip) -{ - /* - * On x86_64, kernel text mappings are mapped read-only, so we use - * the kernel identity mapping instead of the kernel text mapping - * to modify the kernel text. - * - * For 32bit kernels, these mappings are same and we can use - * kernel identity mapping to modify code. - */ - if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa_symbol(ip)); - - return ip; -} - -static const unsigned char *ftrace_nop_replace(void) +static const char *ftrace_nop_replace(void) { return ideal_nops[NOP_ATOMIC5]; } -static int -ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code) +static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) { - unsigned char replaced[MCOUNT_INSN_SIZE]; + return ftrace_text_replace(CALL_INSN_OPCODE, ip, addr); +} - ftrace_expected = old_code; +static int ftrace_verify_code(unsigned long ip, const char *old_code) +{ + char cur_code[MCOUNT_INSN_SIZE]; /* * Note: @@ -129,31 +102,41 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, * Carefully read and modify the code with probe_kernel_*(), and make * sure what we read is what we expected it to be before modifying it. 
*/ - /* read the text we want to modify */ - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) { + WARN_ON(1); return -EFAULT; + } /* Make sure it is what we expect it to be */ - if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) { + WARN_ON(1); return -EINVAL; - - ip = text_ip_addr(ip); - - /* replace the text with the new text */ - if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) - return -EPERM; - - sync_core(); + } return 0; } -int ftrace_make_nop(struct module *mod, - struct dyn_ftrace *rec, unsigned long addr) +static int +ftrace_modify_code_direct(unsigned long ip, const char *old_code, + const char *new_code) +{ + int ret = ftrace_verify_code(ip, old_code); + if (ret) + return ret; + + /* replace the text with the new text */ + if (ftrace_poke_late) + text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + else + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); + return 0; +} + +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned const char *new, *old; unsigned long ip = rec->ip; + const char *new, *old; old = ftrace_call_replace(ip, addr); new = ftrace_nop_replace(); @@ -167,19 +150,20 @@ int ftrace_make_nop(struct module *mod, * just modify the code directly. */ if (addr == MCOUNT_ADDR) - return ftrace_modify_code_direct(rec->ip, old, new); + return ftrace_modify_code_direct(ip, old, new); - ftrace_expected = NULL; - - /* Normal cases use add_brk_on_nop */ + /* + * x86 overrides ftrace_replace_code -- this function will never be used + * in this case. + */ WARN_ONCE(1, "invalid use of ftrace_make_nop"); return -EINVAL; } int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned const char *new, *old; unsigned long ip = rec->ip; + const char *new, *old; old = ftrace_nop_replace(); new = ftrace_call_replace(ip, addr); @@ -188,43 +172,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ftrace_modify_code_direct(rec->ip, old, new); } -/* - * The modifying_ftrace_code is used to tell the breakpoint - * handler to call ftrace_int3_handler(). If it fails to - * call this handler for a breakpoint added by ftrace, then - * the kernel may crash. - * - * As atomic_writes on x86 do not need a barrier, we do not - * need to add smp_mb()s for this to work. It is also considered - * that we can not read the modifying_ftrace_code before - * executing the breakpoint. That would be quite remarkable if - * it could do that. Here's the flow that is required: - * - * CPU-0 CPU-1 - * - * atomic_inc(mfc); - * write int3s - * // implicit (r)mb - * if (atomic_read(mfc)) - * call ftrace_int3_handler() - * - * Then when we are finished: - * - * atomic_dec(mfc); - * - * If we hit a breakpoint that was not set by ftrace, it does not - * matter if ftrace_int3_handler() is called or not. It will - * simply be ignored. But it is crucial that a ftrace nop/caller - * breakpoint is handled. No other user should ever place a - * breakpoint on an ftrace nop/caller location. It must only - * be done by this code. 
- */ -atomic_t modifying_ftrace_code __read_mostly; - -static int -ftrace_modify_code(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code); - /* * Should never be called: * As it is only called by __ftrace_replace_code() which is called by @@ -237,452 +184,84 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { WARN_ON(1); - ftrace_expected = NULL; return -EINVAL; } -static unsigned long ftrace_update_func; -static unsigned long ftrace_update_func_call; - -static int update_ftrace_func(unsigned long ip, void *new) -{ - unsigned char old[MCOUNT_INSN_SIZE]; - int ret; - - memcpy(old, (void *)ip, MCOUNT_INSN_SIZE); - - ftrace_update_func = ip; - /* Make sure the breakpoints see the ftrace_update_func update */ - smp_wmb(); - - /* See comment above by declaration of modifying_ftrace_code */ - atomic_inc(&modifying_ftrace_code); - - ret = ftrace_modify_code(ip, old, new); - - atomic_dec(&modifying_ftrace_code); - - return ret; -} - int ftrace_update_ftrace_func(ftrace_func_t func) -{ - unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char *new; - int ret; - - ftrace_update_func_call = (unsigned long)func; - - new = ftrace_call_replace(ip, (unsigned long)func); - ret = update_ftrace_func(ip, new); - - /* Also update the regs callback function */ - if (!ret) { - ip = (unsigned long)(&ftrace_regs_call); - new = ftrace_call_replace(ip, (unsigned long)func); - ret = update_ftrace_func(ip, new); - } - - return ret; -} - -static nokprobe_inline int is_ftrace_caller(unsigned long ip) -{ - if (ip == ftrace_update_func) - return 1; - - return 0; -} - -/* - * A breakpoint was added to the code address we are about to - * modify, and this is the handle that will just skip over it. - * We are either changing a nop into a trace call, or a trace - * call to a nop. While the change is taking place, we treat - * it just like it was a nop. 
- */ -int ftrace_int3_handler(struct pt_regs *regs) { unsigned long ip; + const char *new; - if (WARN_ON_ONCE(!regs)) - return 0; + ip = (unsigned long)(&ftrace_call); + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); - ip = regs->ip - INT3_INSN_SIZE; - - if (ftrace_location(ip)) { - int3_emulate_call(regs, (unsigned long)ftrace_regs_caller); - return 1; - } else if (is_ftrace_caller(ip)) { - if (!ftrace_update_func_call) { - int3_emulate_jmp(regs, ip + CALL_INSN_SIZE); - return 1; - } - int3_emulate_call(regs, ftrace_update_func_call); - return 1; - } + ip = (unsigned long)(&ftrace_regs_call); + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } -NOKPROBE_SYMBOL(ftrace_int3_handler); - -static int ftrace_write(unsigned long ip, const char *val, int size) -{ - ip = text_ip_addr(ip); - - if (probe_kernel_write((void *)ip, val, size)) - return -EPERM; - - return 0; -} - -static int add_break(unsigned long ip, const char *old) -{ - unsigned char replaced[MCOUNT_INSN_SIZE]; - unsigned char brk = BREAKPOINT_INSTRUCTION; - - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) - return -EFAULT; - - ftrace_expected = old; - - /* Make sure it is what we expect it to be */ - if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) - return -EINVAL; - - return ftrace_write(ip, &brk, 1); -} - -static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned const char *old; - unsigned long ip = rec->ip; - - old = ftrace_call_replace(ip, addr); - - return add_break(rec->ip, old); -} - - -static int add_brk_on_nop(struct dyn_ftrace *rec) -{ - unsigned const char *old; - - old = ftrace_nop_replace(); - - return add_break(rec->ip, old); -} - -static int add_breakpoints(struct dyn_ftrace *rec, bool enable) -{ - unsigned long ftrace_addr; - int ret; - - ftrace_addr = ftrace_get_addr_curr(rec); - - ret = ftrace_test_record(rec, enable); - - switch (ret) { - case FTRACE_UPDATE_IGNORE: - return 0; - - case FTRACE_UPDATE_MAKE_CALL: - /* converting nop to call */ - return add_brk_on_nop(rec); - - case FTRACE_UPDATE_MODIFY_CALL: - case FTRACE_UPDATE_MAKE_NOP: - /* converting a call to a nop */ - return add_brk_on_call(rec, ftrace_addr); - } - return 0; -} - -/* - * On error, we need to remove breakpoints. This needs to - * be done caefully. If the address does not currently have a - * breakpoint, we know we are done. Otherwise, we look at the - * remaining 4 bytes of the instruction. If it matches a nop - * we replace the breakpoint with the nop. Otherwise we replace - * it with the call instruction. - */ -static int remove_breakpoint(struct dyn_ftrace *rec) -{ - unsigned char ins[MCOUNT_INSN_SIZE]; - unsigned char brk = BREAKPOINT_INSTRUCTION; - const unsigned char *nop; - unsigned long ftrace_addr; - unsigned long ip = rec->ip; - - /* If we fail the read, just give up */ - if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) - return -EFAULT; - - /* If this does not have a breakpoint, we are done */ - if (ins[0] != brk) - return 0; - - nop = ftrace_nop_replace(); - - /* - * If the last 4 bytes of the instruction do not match - * a nop, then we assume that this is a call to ftrace_addr. - */ - if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { - /* - * For extra paranoidism, we check if the breakpoint is on - * a call that would actually jump to the ftrace_addr. - * If not, don't touch the breakpoint, we make just create - * a disaster. 
- */ - ftrace_addr = ftrace_get_addr_new(rec); - nop = ftrace_call_replace(ip, ftrace_addr); - - if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) - goto update; - - /* Check both ftrace_addr and ftrace_old_addr */ - ftrace_addr = ftrace_get_addr_curr(rec); - nop = ftrace_call_replace(ip, ftrace_addr); - - ftrace_expected = nop; - - if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) - return -EINVAL; - } - - update: - return ftrace_write(ip, nop, 1); -} - -static int add_update_code(unsigned long ip, unsigned const char *new) -{ - /* skip breakpoint */ - ip++; - new++; - return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1); -} - -static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_call_replace(ip, addr); - return add_update_code(ip, new); -} - -static int add_update_nop(struct dyn_ftrace *rec) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_nop_replace(); - return add_update_code(ip, new); -} - -static int add_update(struct dyn_ftrace *rec, bool enable) -{ - unsigned long ftrace_addr; - int ret; - - ret = ftrace_test_record(rec, enable); - - ftrace_addr = ftrace_get_addr_new(rec); - - switch (ret) { - case FTRACE_UPDATE_IGNORE: - return 0; - - case FTRACE_UPDATE_MODIFY_CALL: - case FTRACE_UPDATE_MAKE_CALL: - /* converting nop to call */ - return add_update_call(rec, ftrace_addr); - - case FTRACE_UPDATE_MAKE_NOP: - /* converting a call to a nop */ - return add_update_nop(rec); - } - - return 0; -} - -static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_call_replace(ip, addr); - - return ftrace_write(ip, new, 1); -} - -static int finish_update_nop(struct dyn_ftrace *rec) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_nop_replace(); - - return ftrace_write(ip, new, 1); -} - -static int finish_update(struct dyn_ftrace *rec, bool enable) -{ - unsigned long ftrace_addr; - int ret; - - ret = ftrace_update_record(rec, enable); - - ftrace_addr = ftrace_get_addr_new(rec); - - switch (ret) { - case FTRACE_UPDATE_IGNORE: - return 0; - - case FTRACE_UPDATE_MODIFY_CALL: - case FTRACE_UPDATE_MAKE_CALL: - /* converting nop to call */ - return finish_update_call(rec, ftrace_addr); - - case FTRACE_UPDATE_MAKE_NOP: - /* converting a call to a nop */ - return finish_update_nop(rec); - } - - return 0; -} - -static void do_sync_core(void *data) -{ - sync_core(); -} - -static void run_sync(void) -{ - int enable_irqs; - - /* No need to sync if there's only one CPU */ - if (num_online_cpus() == 1) - return; - - enable_irqs = irqs_disabled(); - - /* We may be called with interrupts disabled (on bootup). 
*/ - if (enable_irqs) - local_irq_enable(); - on_each_cpu(do_sync_core, NULL, 1); - if (enable_irqs) - local_irq_disable(); -} void ftrace_replace_code(int enable) { struct ftrace_rec_iter *iter; struct dyn_ftrace *rec; - const char *report = "adding breakpoints"; - int count = 0; + const char *new, *old; int ret; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); - ret = add_breakpoints(rec, enable); - if (ret) - goto remove_breakpoints; - count++; + switch (ftrace_test_record(rec, enable)) { + case FTRACE_UPDATE_IGNORE: + default: + continue; + + case FTRACE_UPDATE_MAKE_CALL: + old = ftrace_nop_replace(); + break; + + case FTRACE_UPDATE_MODIFY_CALL: + case FTRACE_UPDATE_MAKE_NOP: + old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec)); + break; + } + + ret = ftrace_verify_code(rec->ip, old); + if (ret) { + ftrace_bug(ret, rec); + return; + } } - run_sync(); - - report = "updating code"; - count = 0; - for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); - ret = add_update(rec, enable); - if (ret) - goto remove_breakpoints; - count++; + switch (ftrace_test_record(rec, enable)) { + case FTRACE_UPDATE_IGNORE: + default: + continue; + + case FTRACE_UPDATE_MAKE_CALL: + case FTRACE_UPDATE_MODIFY_CALL: + new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec)); + break; + + case FTRACE_UPDATE_MAKE_NOP: + new = ftrace_nop_replace(); + break; + } + + text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + ftrace_update_record(rec, enable); } - - run_sync(); - - report = "removing breakpoints"; - count = 0; - - for_ftrace_rec_iter(iter) { - rec = ftrace_rec_iter_record(iter); - - ret = finish_update(rec, enable); - if (ret) - goto remove_breakpoints; - count++; - } - - run_sync(); - - return; - - remove_breakpoints: - pr_warn("Failed on %s (%d):\n", report, count); - ftrace_bug(ret, rec); - for_ftrace_rec_iter(iter) { - rec = ftrace_rec_iter_record(iter); - /* - * Breakpoints are handled only when this function is in - * progress. The system could not work with them. - */ - if (remove_breakpoint(rec)) - BUG(); - } - run_sync(); -} - -static int -ftrace_modify_code(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code) -{ - int ret; - - ret = add_break(ip, old_code); - if (ret) - goto out; - - run_sync(); - - ret = add_update_code(ip, new_code); - if (ret) - goto fail_update; - - run_sync(); - - ret = ftrace_write(ip, new_code, 1); - /* - * The breakpoint is handled only when this function is in progress. - * The system could not work if we could not remove it. 
- */ - BUG_ON(ret); - out: - run_sync(); - return ret; - - fail_update: - /* Also here the system could not work with the breakpoint */ - if (ftrace_write(ip, old_code, 1)) - BUG(); - goto out; + text_poke_finish(); } void arch_ftrace_update_code(int command) { - /* See comment above by declaration of modifying_ftrace_code */ - atomic_inc(&modifying_ftrace_code); - ftrace_modify_all_code(command); - - atomic_dec(&modifying_ftrace_code); } int __init ftrace_dyn_arch_init(void) @@ -747,6 +326,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long start_offset; unsigned long end_offset; unsigned long op_offset; + unsigned long call_offset; unsigned long offset; unsigned long npages; unsigned long size; @@ -763,10 +343,12 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) start_offset = (unsigned long)ftrace_regs_caller; end_offset = (unsigned long)ftrace_regs_caller_end; op_offset = (unsigned long)ftrace_regs_caller_op_ptr; + call_offset = (unsigned long)ftrace_regs_call; } else { start_offset = (unsigned long)ftrace_caller; end_offset = (unsigned long)ftrace_epilogue; op_offset = (unsigned long)ftrace_caller_op_ptr; + call_offset = (unsigned long)ftrace_call; } size = end_offset - start_offset; @@ -823,16 +405,21 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* put in the new offset to the ftrace_ops */ memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + /* put in the call to the function */ + mutex_lock(&text_mutex); + call_offset -= start_offset; + memcpy(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, + trampoline + call_offset, + ftrace_ops_get_func(ops)), CALL_INSN_SIZE); + mutex_unlock(&text_mutex); + /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; set_vm_flush_reset_perms(trampoline); - /* - * Module allocation needs to be completed by making the page - * executable. The page is still writable, which is a security hazard, - * but anyhow ftrace breaks W^X completely. - */ + set_memory_ro((unsigned long)trampoline, npages); set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: @@ -859,43 +446,35 @@ static unsigned long calc_trampoline_call_offset(bool save_regs) void arch_ftrace_update_trampoline(struct ftrace_ops *ops) { ftrace_func_t func; - unsigned char *new; unsigned long offset; unsigned long ip; unsigned int size; - int ret, npages; + const char *new; - if (ops->trampoline) { - /* - * The ftrace_ops caller may set up its own trampoline. - * In such a case, this code must not modify it. - */ - if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) - return; - npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT; - set_memory_rw(ops->trampoline, npages); - } else { + if (!ops->trampoline) { ops->trampoline = create_trampoline(ops, &size); if (!ops->trampoline) return; ops->trampoline_size = size; - npages = PAGE_ALIGN(size) >> PAGE_SHIFT; + return; } + /* + * The ftrace_ops caller may set up its own trampoline. + * In such a case, this code must not modify it. 
+ */ + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); ip = ops->trampoline + offset; - func = ftrace_ops_get_func(ops); - ftrace_update_func_call = (unsigned long)func; - + mutex_lock(&text_mutex); /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); - ret = update_ftrace_func(ip, new); - set_memory_ro(ops->trampoline, npages); - - /* The update should never fail */ - WARN_ON(ret); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + mutex_unlock(&text_mutex); } /* Return the address of the function the trampoline calls */ @@ -981,19 +560,18 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE extern void ftrace_graph_call(void); -static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) +static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { - return ftrace_text_replace(0xe9, ip, addr); + return ftrace_text_replace(JMP32_INSN_OPCODE, ip, addr); } static int ftrace_mod_jmp(unsigned long ip, void *func) { - unsigned char *new; + const char *new; - ftrace_update_func_call = 0UL; new = ftrace_jmp_replace(ip, (unsigned long)func); - - return update_ftrace_func(ip, new); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + return 0; } int ftrace_enable_ftrace_graph_caller(void) @@ -1019,10 +597,9 @@ int ftrace_disable_ftrace_graph_caller(void) void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long frame_pointer) { + unsigned long return_hooker = (unsigned long)&return_to_handler; unsigned long old; int faulted; - unsigned long return_hooker = (unsigned long) - &return_to_handler; /* * When resuming from suspend-to-ram, this function can be indirectly diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c90312146da0..3451a004e162 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -563,15 +563,6 @@ NOKPROBE_SYMBOL(do_general_protection); dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * ftrace must be first, everything else may cause a recursive crash. - * See note by declaration of modifying_ftrace_code in ftrace.c - */ - if (unlikely(atomic_read(&modifying_ftrace_code)) && - ftrace_int3_handler(regs)) - return; -#endif if (poke_int3_handler(regs)) return; From c12af4407fa5a3fc6396bde379e0882a132df49b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Sep 2019 10:16:12 +0200 Subject: [PATCH 05/19] x86/mm: Remove set_kernel_text_r[ow]() With the last and only user of these functions gone (ftrace) remove them as well to avoid ever growing new users. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132457.819095320@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/set_memory.h | 2 -- arch/x86/mm/init_32.c | 28 ------------------------ arch/x86/mm/init_64.c | 36 ------------------------------- 3 files changed, 66 deletions(-) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 2ee8e469dcf5..64c3dce374e5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -81,8 +81,6 @@ int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); extern int kernel_set_to_readonly; -void set_kernel_text_rw(void); -void set_kernel_text_ro(void); #ifdef CONFIG_X86_64 static inline int set_mce_nospec(unsigned long pfn) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 930edeb41ec3..e9f239e25bb0 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -874,34 +874,6 @@ void arch_remove_memory(int nid, u64 start, u64 size, int kernel_set_to_readonly __read_mostly; -void set_kernel_text_rw(void) -{ - unsigned long start = PFN_ALIGN(_text); - unsigned long size = PFN_ALIGN(_etext) - start; - - if (!kernel_set_to_readonly) - return; - - pr_debug("Set kernel text: %lx - %lx for read write\n", - start, start+size); - - set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); -} - -void set_kernel_text_ro(void) -{ - unsigned long start = PFN_ALIGN(_text); - unsigned long size = PFN_ALIGN(_etext) - start; - - if (!kernel_set_to_readonly) - return; - - pr_debug("Set kernel text: %lx - %lx for read only\n", - start, start+size); - - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); -} - static void mark_nxdata_nx(void) { /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dcb9bc961b39..b326f1440219 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1260,42 +1260,6 @@ void __init mem_init(void) int kernel_set_to_readonly; -void set_kernel_text_rw(void) -{ - unsigned long start = PFN_ALIGN(_text); - unsigned long end = PFN_ALIGN(_etext); - - if (!kernel_set_to_readonly) - return; - - pr_debug("Set kernel text: %lx - %lx for read write\n", - start, end); - - /* - * Make the kernel identity mapping for text RW. Kernel text - * mapping will always be RO. Refer to the comment in - * static_protections() in pageattr.c - */ - set_memory_rw(start, (end - start) >> PAGE_SHIFT); -} - -void set_kernel_text_ro(void) -{ - unsigned long start = PFN_ALIGN(_text); - unsigned long end = PFN_ALIGN(_etext); - - if (!kernel_set_to_readonly) - return; - - pr_debug("Set kernel text: %lx - %lx for read only\n", - start, end); - - /* - * Set the kernel identity mapping for text RO. - */ - set_memory_ro(start, (end - start) >> PAGE_SHIFT); -} - void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); From 254d2c04515ea4532a503cc5d8649e1513042e56 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Oct 2019 12:44:17 +0200 Subject: [PATCH 06/19] x86/alternative: Add text_opcode_size() Introduce a common helper to map *_INSN_OPCODE to *_INSN_SIZE. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: H. 
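
For illustration only (example_insn_len() is made up; the opcode and size defines are the existing ones from text-patching.h), the helper maps INT3 (0xCC) to 1 byte, CALL (0xE8) and JMP32 (0xE9) to 5 bytes, JMP8 (0xEB) to 2 bytes, and anything else to 0:

    static int example_insn_len(u8 opcode)
    {
            int len = text_opcode_size(opcode);

            WARN_ON_ONCE(!len);     /* not an INT3/CALL/JMP32/JMP8 opcode */
            return len;
    }

Later patches in this series rely on exactly this to derive an instruction's length from its first byte instead of storing it in struct text_poke_loc.
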
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132457.875666061@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/text-patching.h | 43 ++++++++++++++++++++-------- arch/x86/kernel/alternative.c | 12 +------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 95beb85aef65..93e4266cd5a3 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -49,18 +49,6 @@ extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_finish(void); -extern void *text_gen_insn(u8 opcode, const void *addr, const void *dest); - -extern int after_bootmem; -extern __ro_after_init struct mm_struct *poking_mm; -extern __ro_after_init unsigned long poking_addr; - -#ifndef CONFIG_UML_X86 -static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) -{ - regs->ip = ip; -} - #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC @@ -73,6 +61,37 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) #define JMP8_INSN_SIZE 2 #define JMP8_INSN_OPCODE 0xEB +static inline int text_opcode_size(u8 opcode) +{ + int size = 0; + +#define __CASE(insn) \ + case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break + + switch(opcode) { + __CASE(INT3); + __CASE(CALL); + __CASE(JMP32); + __CASE(JMP8); + } + +#undef __CASE + + return size; +} + +extern void *text_gen_insn(u8 opcode, const void *addr, const void *dest); + +extern int after_bootmem; +extern __ro_after_init struct mm_struct *poking_mm; +extern __ro_after_init unsigned long poking_addr; + +#ifndef CONFIG_UML_X86 +static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ce737f1da834..f8f34f94d13d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1259,22 +1259,12 @@ union text_poke_insn { void *text_gen_insn(u8 opcode, const void *addr, const void *dest) { static union text_poke_insn insn; /* text_mutex */ - int size = 0; + int size = text_opcode_size(opcode); lockdep_assert_held(&text_mutex); insn.opcode = opcode; -#define __CASE(insn) \ - case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break - - switch(opcode) { - __CASE(INT3); - __CASE(CALL); - __CASE(JMP32); - __CASE(JMP8); - } - if (size > 1) { insn.disp = (long)dest - (long)(addr + size); if (size == 2) From 67c1d4a28064f9ec63df03f7798e4a334176a9cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Oct 2019 12:44:14 +0200 Subject: [PATCH 07/19] x86/ftrace: Use text_gen_insn() Replace the ftrace_code_union with the generic text_gen_insn() helper, which does exactly this. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132457.932808000@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/text-patching.h | 30 +++++++++++++++++++++++++- arch/x86/kernel/alternative.c | 26 ---------------------- arch/x86/kernel/ftrace.c | 32 ++++++---------------------- 3 files changed, 36 insertions(+), 52 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 93e4266cd5a3..ad8f9f433a1b 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -80,7 +80,35 @@ static inline int text_opcode_size(u8 opcode) return size; } -extern void *text_gen_insn(u8 opcode, const void *addr, const void *dest); +union text_poke_insn { + u8 text[POKE_MAX_OPCODE_SIZE]; + struct { + u8 opcode; + s32 disp; + } __attribute__((packed)); +}; + +static __always_inline +void *text_gen_insn(u8 opcode, const void *addr, const void *dest) +{ + static union text_poke_insn insn; /* per instance */ + int size = text_opcode_size(opcode); + + insn.opcode = opcode; + + if (size > 1) { + insn.disp = (long)dest - (long)(addr + size); + if (size == 2) { + /* + * Ensure that for JMP9 the displacement + * actually fits the signed byte. + */ + BUG_ON((insn.disp >> 31) != (insn.disp >> 7)); + } + } + + return &insn.text; +} extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f8f34f94d13d..cfcfadf5cc80 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1247,29 +1247,3 @@ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void * text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } - -union text_poke_insn { - u8 text[POKE_MAX_OPCODE_SIZE]; - struct { - u8 opcode; - s32 disp; - } __attribute__((packed)); -}; - -void *text_gen_insn(u8 opcode, const void *addr, const void *dest) -{ - static union text_poke_insn insn; /* text_mutex */ - int size = text_opcode_size(opcode); - - lockdep_assert_held(&text_mutex); - - insn.opcode = opcode; - - if (size > 1) { - insn.disp = (long)dest - (long)(addr + size); - if (size == 2) - BUG_ON((insn.disp >> 31) != (insn.disp >> 7)); - } - - return &insn.text; -} diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3d8adeba2651..2a179fb35cd1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -63,24 +63,6 @@ int ftrace_arch_code_modify_post_process(void) return 0; } -union ftrace_code_union { - char code[MCOUNT_INSN_SIZE]; - struct { - char op; - int offset; - } __attribute__((packed)); -}; - -static const char *ftrace_text_replace(char op, unsigned long ip, unsigned long addr) -{ - static union ftrace_code_union calc; - - calc.op = op; - calc.offset = (int)(addr - (ip + MCOUNT_INSN_SIZE)); - - return calc.code; -} - static const char *ftrace_nop_replace(void) { return ideal_nops[NOP_ATOMIC5]; @@ -88,7 +70,7 @@ static const char *ftrace_nop_replace(void) static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) { - return ftrace_text_replace(CALL_INSN_OPCODE, ip, addr); + return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); } static int ftrace_verify_code(unsigned long ip, const char *old_code) @@ -480,20 +462,20 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) /* Return the address of the function the trampoline calls */ static void *addr_from_call(void *ptr) 
{ - union ftrace_code_union calc; + union text_poke_insn call; int ret; - ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE); + ret = probe_kernel_read(&call, ptr, CALL_INSN_SIZE); if (WARN_ON_ONCE(ret < 0)) return NULL; /* Make sure this is a call */ - if (WARN_ON_ONCE(calc.op != 0xe8)) { - pr_warn("Expected e8, got %x\n", calc.op); + if (WARN_ON_ONCE(call.opcode != CALL_INSN_OPCODE)) { + pr_warn("Expected E8, got %x\n", call.opcode); return NULL; } - return ptr + MCOUNT_INSN_SIZE + calc.offset; + return ptr + CALL_INSN_SIZE + call.disp; } void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, @@ -562,7 +544,7 @@ extern void ftrace_graph_call(void); static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { - return ftrace_text_replace(JMP32_INSN_OPCODE, ip, addr); + return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr); } static int ftrace_mod_jmp(unsigned long ip, void *func) From 97e6c977ccf128c3f34d6084ad53fc0021f90e03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Oct 2019 12:44:20 +0200 Subject: [PATCH 08/19] x86/alternative: Remove text_poke_loc::len Per the BUG_ON(len != insn.length) in text_poke_loc_init(), tp->len must indeed be the same as text_opcode_size(tp->opcode). Use this to remove this field from the structure. Sadly, due to 8 byte alignment, this only increases the structure padding. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132457.989922744@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index cfcfadf5cc80..6e3ee73775f6 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -938,7 +938,6 @@ static void do_sync_core(void *info) struct text_poke_loc { void *addr; - int len; s32 rel32; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; @@ -965,6 +964,7 @@ int notrace poke_int3_handler(struct pt_regs *regs) { struct text_poke_loc *tp; void *ip; + int len; /* * Having observed our INT3 instruction, we now must observe @@ -1004,7 +1004,8 @@ int notrace poke_int3_handler(struct pt_regs *regs) return 0; } - ip += tp->len; + len = text_opcode_size(tp->opcode); + ip += len; switch (tp->opcode) { case INT3_INSN_OPCODE: @@ -1085,10 +1086,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * Second step: update all but the first byte of the patched range. 
*/ for (do_sync = 0, i = 0; i < nr_entries; i++) { - if (tp[i].len - sizeof(int3) > 0) { + int len = text_opcode_size(tp[i].opcode); + + if (len - sizeof(int3) > 0) { text_poke((char *)tp[i].addr + sizeof(int3), (const char *)tp[i].text + sizeof(int3), - tp[i].len - sizeof(int3)); + len - sizeof(int3)); do_sync++; } } @@ -1141,7 +1144,6 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, BUG_ON(len != insn.length); tp->addr = addr; - tp->len = len; tp->opcode = insn.opcode.bytes[0]; switch (tp->opcode) { From 4531ef6a8aaf132aa32e2e26670c652942540633 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Oct 2019 12:26:53 +0200 Subject: [PATCH 09/19] x86/alternative: Shrink text_poke_loc Employ the fact that all text must be within a s32 displacement of one another to shrink the text_poke_loc::addr field. Make it relative to _stext. This then shrinks struct text_poke_loc to 16 bytes, and consequently increases TP_VEC_MAX from 170 to 256. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132458.047052889@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6e3ee73775f6..526cc5fb7314 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -937,7 +937,7 @@ static void do_sync_core(void *info) } struct text_poke_loc { - void *addr; + s32 rel_addr; /* addr := _stext + rel_addr */ s32 rel32; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; @@ -948,13 +948,18 @@ static struct bp_patching_desc { int nr_entries; } bp_patching; +static inline void *text_poke_addr(struct text_poke_loc *tp) +{ + return _stext + tp->rel_addr; +} + static int notrace patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; - if (key < tp->addr) + if (key < text_poke_addr(tp)) return -1; - if (key > tp->addr) + if (key > text_poke_addr(tp)) return 1; return 0; } @@ -1000,7 +1005,7 @@ int notrace poke_int3_handler(struct pt_regs *regs) return 0; } else { tp = bp_patching.vec; - if (tp->addr != ip) + if (text_poke_addr(tp) != ip) return 0; } @@ -1078,7 +1083,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * First step: add a int3 trap to the address that will be patched. 
*/ for (i = 0; i < nr_entries; i++) - text_poke(tp[i].addr, &int3, sizeof(int3)); + text_poke(text_poke_addr(&tp[i]), &int3, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); @@ -1089,7 +1094,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries int len = text_opcode_size(tp[i].opcode); if (len - sizeof(int3) > 0) { - text_poke((char *)tp[i].addr + sizeof(int3), + text_poke(text_poke_addr(&tp[i]) + sizeof(int3), (const char *)tp[i].text + sizeof(int3), len - sizeof(int3)); do_sync++; @@ -1113,7 +1118,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries if (tp[i].text[0] == INT3_INSN_OPCODE) continue; - text_poke(tp[i].addr, tp[i].text, sizeof(int3)); + text_poke(text_poke_addr(&tp[i]), tp[i].text, sizeof(int3)); do_sync++; } @@ -1143,7 +1148,7 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, BUG_ON(!insn_complete(&insn)); BUG_ON(len != insn.length); - tp->addr = addr; + tp->rel_addr = addr - (void *)_stext; tp->opcode = insn.opcode.bytes[0]; switch (tp->opcode) { @@ -1192,7 +1197,7 @@ static bool tp_order_fail(void *addr) return true; tp = &tp_vec[tp_vec_nr - 1]; - if ((unsigned long)tp->addr > (unsigned long)addr) + if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) return true; return false; From 38ebd8d119245eecb99fe00b0f57e269baf22767 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 18 Nov 2019 18:20:12 +0100 Subject: [PATCH 10/19] x86/ftrace: Mark ftrace_modify_code_direct() __ref ... because it calls the .init.text function text_poke_early(). That is ok because it does call that function early, during boot. Fixes: 9706f7c3531f ("x86/ftrace: Use text_poke()") Suggested-by: Peter Zijlstra Signed-off-by: Borislav Petkov Acked-by: Steven Rostedt (VMware) Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Daniel Bristot de Oliveira Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191116204607.GC23231@zn.tnic --- arch/x86/kernel/ftrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2a179fb35cd1..108ee96f8b66 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -99,7 +99,12 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code) return 0; } -static int +/* + * Marked __ref because it calls text_poke_early() which is .init.text. That is + * ok because that call will happen early, during boot, when .init sections are + * still present. + */ +static int __ref ftrace_modify_code_direct(unsigned long ip, const char *old_code, const char *new_code) { From ab09e95ca0c697e67f986c4a45a179abf47e7bfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Oct 2019 13:57:17 +0200 Subject: [PATCH 11/19] x86/kprobes: Convert to text-patching.h Convert kprobes to the new text-poke naming. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132458.103959370@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kprobes.h | 14 ++++----- arch/x86/include/asm/text-patching.h | 2 ++ arch/x86/kernel/kprobes/core.c | 18 ++++++------ arch/x86/kernel/kprobes/opt.c | 44 +++++++++++++--------------- 4 files changed, 37 insertions(+), 41 deletions(-) diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 5dc909d9ad81..95b1f053bd96 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -11,12 +11,11 @@ #include -#define BREAKPOINT_INSTRUCTION 0xcc - #ifdef CONFIG_KPROBES #include #include #include +#include #include #define __ARCH_WANT_KPROBES_INSN_SLOT @@ -25,10 +24,7 @@ struct pt_regs; struct kprobe; typedef u8 kprobe_opcode_t; -#define RELATIVEJUMP_OPCODE 0xe9 -#define RELATIVEJUMP_SIZE 5 -#define RELATIVECALL_OPCODE 0xe8 -#define RELATIVE_ADDR_SIZE 4 + #define MAX_STACK_SIZE 64 #define CUR_STACK_SIZE(ADDR) \ (current_top_of_stack() - (unsigned long)(ADDR)) @@ -43,11 +39,11 @@ extern __visible kprobe_opcode_t optprobe_template_entry[]; extern __visible kprobe_opcode_t optprobe_template_val[]; extern __visible kprobe_opcode_t optprobe_template_call[]; extern __visible kprobe_opcode_t optprobe_template_end[]; -#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) +#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + DISP32_SIZE) #define MAX_OPTINSN_SIZE \ (((unsigned long)optprobe_template_end - \ (unsigned long)optprobe_template_entry) + \ - MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) + MAX_OPTIMIZED_LENGTH + JMP32_INSN_SIZE) extern const int kretprobe_blacklist_size; @@ -73,7 +69,7 @@ struct arch_specific_insn { struct arch_optimized_insn { /* copy of the original instructions */ - kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE]; + kprobe_opcode_t copied_insn[DISP32_SIZE]; /* detour code buffer */ kprobe_opcode_t *insn; /* the size of instructions copied to detour code buffer */ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index ad8f9f433a1b..4c09f4286b0c 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -61,6 +61,8 @@ extern void text_poke_finish(void); #define JMP8_INSN_SIZE 2 #define JMP8_INSN_OPCODE 0xEB +#define DISP32_SIZE 4 + static inline int text_opcode_size(u8 opcode) { int size = 0; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4f13af7cbcdb..697c059173bc 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -119,14 +119,14 @@ __synthesize_relative_insn(void *dest, void *from, void *to, u8 op) /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ void synthesize_reljump(void *dest, void *from, void *to) { - __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE); + __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE); } NOKPROBE_SYMBOL(synthesize_reljump); /* Insert a call instruction at address 'from', which calls address 'to'.*/ void synthesize_relcall(void *dest, void *from, void *to) { - __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE); + __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE); } NOKPROBE_SYMBOL(synthesize_relcall); @@ -301,7 +301,7 @@ static int can_probe(unsigned long paddr) * Another debugging subsystem might insert this breakpoint. * In that case, we can't recover it. 
*/ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) return 0; addr += insn.length; } @@ -356,7 +356,7 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) return 0; /* Another subsystem puts a breakpoint, failed to recover */ - if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn->opcode.bytes[0] == INT3_INSN_OPCODE) return 0; /* We should not singlestep on the exception masking instructions */ @@ -400,14 +400,14 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, int len = insn->length; if (can_boost(insn, p->addr) && - MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) { + MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) { /* * These instructions can be executed directly if it * jumps back to correct address. */ synthesize_reljump(buf + len, p->ainsn.insn + len, p->addr + insn->length); - len += RELATIVEJUMP_SIZE; + len += JMP32_INSN_SIZE; p->ainsn.boostable = true; } else { p->ainsn.boostable = false; @@ -501,7 +501,7 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_arm_kprobe(struct kprobe *p) { - text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); + text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); } void arch_disarm_kprobe(struct kprobe *p) @@ -609,7 +609,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, regs->flags |= X86_EFLAGS_TF; regs->flags &= ~X86_EFLAGS_IF; /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) + if (p->opcode == INT3_INSN_OPCODE) regs->ip = (unsigned long)p->addr; else regs->ip = (unsigned long)p->ainsn.insn; @@ -695,7 +695,7 @@ int kprobe_int3_handler(struct pt_regs *regs) reset_current_kprobe(); return 1; } - } else if (*addr != BREAKPOINT_INSTRUCTION) { + } else if (*addr != INT3_INSN_OPCODE) { /* * The breakpoint instruction was removed right * after we hit it. 
Another cpu has removed diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 9b01ee7b3923..0d9ea487d573 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -38,7 +38,7 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) long offs; int i; - for (i = 0; i < RELATIVEJUMP_SIZE; i++) { + for (i = 0; i < JMP32_INSN_SIZE; i++) { kp = get_kprobe((void *)addr - i); /* This function only handles jump-optimized kprobe */ if (kp && kprobe_optimized(kp)) { @@ -62,10 +62,10 @@ found: if (addr == (unsigned long)kp->addr) { buf[0] = kp->opcode; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE); } else { offs = addr - (unsigned long)kp->addr - 1; - memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); + memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs); } return (unsigned long)buf; @@ -141,8 +141,6 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func); #define TMPL_END_IDX \ ((long)optprobe_template_end - (long)optprobe_template_entry) -#define INT3_SIZE sizeof(kprobe_opcode_t) - /* Optimized kprobe call back function: called from optinsn */ static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) @@ -162,7 +160,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif - regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE; regs->orig_ax = ~0UL; __this_cpu_write(current_kprobe, &op->kp); @@ -179,7 +177,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) struct insn insn; int len = 0, ret; - while (len < RELATIVEJUMP_SIZE) { + while (len < JMP32_INSN_SIZE) { ret = __copy_instruction(dest + len, src + len, real + len, &insn); if (!ret || !can_boost(&insn, src + len)) return -EINVAL; @@ -271,7 +269,7 @@ static int can_optimize(unsigned long paddr) return 0; /* Check there is enough space for a relative jump. */ - if (size - offset < RELATIVEJUMP_SIZE) + if (size - offset < JMP32_INSN_SIZE) return 0; /* Decode instructions */ @@ -290,15 +288,15 @@ static int can_optimize(unsigned long paddr) kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) return 0; /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); /* Check any instructions don't jump into target */ if (insn_is_indirect_jump(&insn) || - insn_jump_into_range(&insn, paddr + INT3_SIZE, - RELATIVE_ADDR_SIZE)) + insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE, + DISP32_SIZE)) return 0; addr += insn.length; } @@ -374,7 +372,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, * Verify if the address gap is in 2GB range, because this uses * a relative jump. 
*/ - rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE; + rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE; if (abs(rel) > 0x7fffffff) { ret = -ERANGE; goto err; } @@ -401,7 +399,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, /* Set returning jmp instruction at the tail of out-of-line buffer */ synthesize_reljump(buf + len, slot + len, (u8 *)op->kp.addr + op->optinsn.size); - len += RELATIVEJUMP_SIZE; + len += JMP32_INSN_SIZE; /* We have to use text_poke() for instruction buffer because it is RO */ text_poke(slot, buf, len); @@ -422,22 +420,22 @@ err: void arch_optimize_kprobes(struct list_head *oplist) { struct optimized_kprobe *op, *tmp; - u8 insn_buff[RELATIVEJUMP_SIZE]; + u8 insn_buff[JMP32_INSN_SIZE]; list_for_each_entry_safe(op, tmp, oplist, list) { s32 rel = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + ((long)op->kp.addr + JMP32_INSN_SIZE)); WARN_ON(kprobe_disabled(&op->kp)); /* Backup instructions which will be replaced by jump address */ - memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, - RELATIVE_ADDR_SIZE); + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE, + DISP32_SIZE); - insn_buff[0] = RELATIVEJUMP_OPCODE; + insn_buff[0] = JMP32_INSN_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL); + text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); list_del_init(&op->list); } @@ -446,13 +444,13 @@ void arch_optimize_kprobes(struct list_head *oplist) /* Replace a relative jump with a breakpoint (int3). */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - u8 insn_buff[RELATIVEJUMP_SIZE]; + u8 insn_buff[JMP32_INSN_SIZE]; /* Set int3 to first byte for kprobes */ - insn_buff[0] = BREAKPOINT_INSTRUCTION; - memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + insn_buff[0] = INT3_INSN_OPCODE; + memcpy(insn_buff + 1, op->optinsn.copied_insn, DISP32_SIZE); - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, + text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, text_gen_insn(JMP32_INSN_OPCODE, op->kp.addr, op->optinsn.insn)); } From 5c02ece81848db29b411139cc923d66050a6a40c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Oct 2019 21:15:28 +0200 Subject: [PATCH 12/19] x86/kprobes: Fix ordering while text-patching Kprobes does something like: register: arch_arm_kprobe() text_poke(INT3) /* guarantees nothing, INT3 will become visible at some point, maybe */ kprobe_optimizer() /* guarantees the bytes after INT3 are unused */ synchronize_rcu_tasks(); text_poke_bp(JMP32); /* implies IPI-sync, kprobe really is enabled */ unregister: __disarm_kprobe() unoptimize_kprobe() text_poke_bp(INT3 + tail); /* implies IPI-sync, so tail is guaranteed visible */ arch_disarm_kprobe() text_poke(old); /* guarantees nothing, old will maybe become visible */ synchronize_rcu() free-stuff Now the problem is that on register, the synchronize_rcu_tasks() is not sufficient to guarantee all CPUs have already observed INT3 (although in practice this is exceedingly unlikely not to have happened) (similar to how MEMBARRIER_CMD_PRIVATE_EXPEDITED does not imply MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE). Worse, even if it did, we'd have to do 2 synchronize calls to provide the guarantee we're looking for, the first to ensure INT3 is visible, the second to guarantee nobody is then still using the instruction bytes after INT3.
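[ Editorial aside, not part of the patch: a minimal sketch of how the register-side arming path reads once the arch/x86/kernel/kprobes/core.c hunk further below is applied. It uses only interfaces that appear in this series (text_poke(), text_poke_sync(), INT3_INSN_OPCODE) and assumes normal kernel context for struct kprobe. ]

	void arch_arm_kprobe(struct kprobe *p)
	{
		/* Publish the INT3 byte at the probed address. */
		text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);

		/*
		 * IPI all CPUs to sync_core(): once this returns, every CPU
		 * has observed the INT3, so the optimizer may safely go on
		 * to reuse the instruction bytes after it.
		 */
		text_poke_sync();
	}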
Similarly, on unregister, the synchronize_rcu() between __unregister_kprobe_top() and __unregister_kprobe_bottom() does not guarantee all CPUs are free of the INT3 (and observe the old text). Therefore, sprinkle some IPI-sync love around. This guarantees that all CPUs agree on the text and RCU once again provides the required guarantees. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Acked-by: Masami Hiramatsu Acked-by: Alexei Starovoitov Acked-by: Paul E. McKenney Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132458.162172862@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/text-patching.h | 1 + arch/x86/kernel/alternative.c | 11 ++++++++--- arch/x86/kernel/kprobes/core.c | 2 ++ arch/x86/kernel/kprobes/opt.c | 12 ++++-------- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 4c09f4286b0c..67315fa3956a 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -42,6 +42,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); * an inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 526cc5fb7314..6455902e3b44 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -936,6 +936,11 @@ static void do_sync_core(void *info) sync_core(); } +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + struct text_poke_loc { s32 rel_addr; /* addr := _stext + rel_addr */ s32 rel32; @@ -1085,7 +1090,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries for (i = 0; i < nr_entries; i++) text_poke(text_poke_addr(&tp[i]), &int3, sizeof(int3)); - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); /* * Second step: update all but the first byte of the patched range. @@ -1107,7 +1112,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel).
*/ - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); } /* @@ -1123,7 +1128,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries } if (do_sync) - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); /* * sync_core() implies an smp_mb() and orders this store against diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 697c059173bc..579d30e91a36 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -502,11 +502,13 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_arm_kprobe(struct kprobe *p) { text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); + text_poke_sync(); } void arch_disarm_kprobe(struct kprobe *p) { text_poke(p->addr, &p->opcode, 1); + text_poke_sync(); } void arch_remove_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 0d9ea487d573..26e0d6c4d8d6 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -444,14 +444,10 @@ void arch_optimize_kprobes(struct list_head *oplist) /* Replace a relative jump with a breakpoint (int3). */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - u8 insn_buff[JMP32_INSN_SIZE]; - - /* Set int3 to first byte for kprobes */ - insn_buff[0] = INT3_INSN_OPCODE; - memcpy(insn_buff + 1, op->optinsn.copied_insn, DISP32_SIZE); - - text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, - text_gen_insn(JMP32_INSN_OPCODE, op->kp.addr, op->optinsn.insn)); + arch_arm_kprobe(&op->kp); + text_poke(op->kp.addr + INT3_INSN_SIZE, + op->optinsn.copied_insn, DISP32_SIZE); + text_poke_sync(); } /* From 5a735583b764750726621b0396d03e4782911b77 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Oct 2019 21:07:35 +0200 Subject: [PATCH 13/19] arm/ftrace: Use __patch_text() Instead of flipping text protection, use the patch_text infrastructure that uses a fixmap alias where required. This removes the last user of set_all_modules_text_*(). Tested-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: ard.biesheuvel@linaro.org Cc: james.morse@arm.com Cc: rabin@rab.in Link: https://lkml.kernel.org/r/20191113092636.GG4131@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/arm/kernel/Makefile | 4 ++-- arch/arm/kernel/ftrace.c | 10 ++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 8cad59465af3..a885172e504c 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -49,8 +49,8 @@ obj-$(CONFIG_HAVE_ARM_SCU) += smp_scu.o obj-$(CONFIG_HAVE_ARM_TWD) += smp_twd.o obj-$(CONFIG_ARM_ARCH_TIMER) += arch_timer.o obj-$(CONFIG_FUNCTION_TRACER) += entry-ftrace.o -obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o insn.o -obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o insn.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o insn.o patch.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o insn.o patch.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o insn.o patch.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o # Main staffs in KPROBES are in arch/arm/probes/ . 
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index bda949fd84e8..2a5ff69c28e6 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #ifdef CONFIG_THUMB2_KERNEL #define NOP 0xf85deb04 /* pop.w {lr} */ @@ -35,9 +36,7 @@ static int __ftrace_modify_code(void *data) { int *command = data; - set_kernel_text_rw(); ftrace_modify_all_code(*command); - set_kernel_text_ro(); return 0; } @@ -59,13 +58,11 @@ static unsigned long adjust_address(struct dyn_ftrace *rec, unsigned long addr) int ftrace_arch_code_modify_prepare(void) { - set_all_modules_text_rw(); return 0; } int ftrace_arch_code_modify_post_process(void) { - set_all_modules_text_ro(); /* Make sure any TLB misses during machine stop are cleared. */ flush_tlb_all(); return 0; @@ -97,10 +94,7 @@ static int ftrace_modify_code(unsigned long pc, unsigned long old, return -EINVAL; } - if (probe_kernel_write((void *)pc, &new, MCOUNT_INSN_SIZE)) - return -EPERM; - - flush_icache_range(pc, pc + MCOUNT_INSN_SIZE); + __patch_text((void *)pc, new); return 0; } From 958de668197651bbf2b4b9528f204ab5a0f1af65 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Oct 2019 21:07:31 +0200 Subject: [PATCH 14/19] module: Remove set_all_modules_text_*() Now that there are no users of set_all_modules_text_*() left, remove it. While it appears nds32 uses it, it does not have STRICT_MODULE_RWX and therefore ends up with the NOP stubs. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Greentime Hu Cc: H. Peter Anvin Cc: Jessica Yu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Chen Link: https://lkml.kernel.org/r/20191111132458.284298307@infradead.org Signed-off-by: Ingo Molnar --- arch/nds32/kernel/ftrace.c | 12 ----------- include/linux/module.h | 4 ---- kernel/module.c | 43 -------------------------------------- 3 files changed, 59 deletions(-) diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index fd2a54b8cd57..22ab77ea27ad 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -89,18 +89,6 @@ int __init ftrace_dyn_arch_init(void) return 0; } -int ftrace_arch_code_modify_prepare(void) -{ - set_all_modules_text_rw(); - return 0; -} - -int ftrace_arch_code_modify_post_process(void) -{ - set_all_modules_text_ro(); - return 0; -} - static unsigned long gen_sethi_insn(unsigned long addr) { unsigned long opcode = 0x46000000; diff --git a/include/linux/module.h b/include/linux/module.h index 6d20895e7739..daae84705040 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -846,13 +846,9 @@ extern int module_sysfs_initialized; #define __MODULE_STRING(x) __stringify(x) #ifdef CONFIG_STRICT_MODULE_RWX -extern void set_all_modules_text_rw(void); -extern void set_all_modules_text_ro(void); extern void module_enable_ro(const struct module *mod, bool after_init); extern void module_disable_ro(const struct module *mod); #else -static inline void set_all_modules_text_rw(void) { } -static inline void set_all_modules_text_ro(void) { } static inline void module_enable_ro(const struct module *mod, bool after_init) { } static inline void module_disable_ro(const struct module *mod) { } #endif diff --git a/kernel/module.c b/kernel/module.c index acf7962936c4..5cd9bed595f4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2029,49 +2029,6 @@ static void 
module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } -/* Iterate through all modules and set each module's text as RW */ -void set_all_modules_text_rw(void) -{ - struct module *mod; - - if (!rodata_enabled) - return; - - mutex_lock(&module_mutex); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - - frob_text(&mod->core_layout, set_memory_rw); - frob_text(&mod->init_layout, set_memory_rw); - } - mutex_unlock(&module_mutex); -} - -/* Iterate through all modules and set each module's text as RO */ -void set_all_modules_text_ro(void) -{ - struct module *mod; - - if (!rodata_enabled) - return; - - mutex_lock(&module_mutex); - list_for_each_entry_rcu(mod, &modules, list) { - /* - * Ignore going modules since it's possible that ro - * protection has already been disabled, otherwise we'll - * run into protection faults at module deallocation. - */ - if (mod->state == MODULE_STATE_UNFORMED || - mod->state == MODULE_STATE_GOING) - continue; - - frob_text(&mod->core_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - } - mutex_unlock(&module_mutex); -} #else /* !CONFIG_STRICT_MODULE_RWX */ static void module_enable_nx(const struct module *mod) { } #endif /* CONFIG_STRICT_MODULE_RWX */ From 04ae87a52074e2d448fc66143f1bd2c7d694d2b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Oct 2019 22:26:59 +0200 Subject: [PATCH 15/19] ftrace: Rework event_create_dir() Rework event_create_dir() to use an array of static data instead of function pointers where possible. The problem is that it would call the function pointer on module load before parse_args(), possibly even before jump_labels were initialized. Luckily the generated functions don't use jump_labels but it still seems fragile. It also gets in the way of changing when we make the module map executable. The generated functions are basically calling trace_define_field() with a bunch of static arguments. So instead of a function, capture these arguments in a static array, avoiding the function call. Now there are a number of cases where the fields are dynamic (syscall arguments, kprobes and uprobes), in which case a static array does not work; for these we preserve the function call. Luckily all these cases are not related to modules and so we can retain the function call for them. Also fix up all broken tracepoint definitions that now generate a compile error. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132458.342979914@infradead.org Signed-off-by: Ingo Molnar --- drivers/infiniband/hw/hfi1/trace_tid.h | 8 +- drivers/infiniband/hw/hfi1/trace_tx.h | 2 +- drivers/lightnvm/pblk-trace.h | 8 +- drivers/net/fjes/fjes_trace.h | 2 +- drivers/net/wireless/ath/ath10k/trace.h | 6 +- fs/xfs/scrub/trace.h | 6 +- fs/xfs/xfs_trace.h | 4 +- include/linux/trace_events.h | 18 +++- include/trace/events/filemap.h | 2 +- include/trace/events/rpcrdma.h | 2 +- include/trace/trace_events.h | 64 +++++--------- kernel/trace/trace.h | 31 ++++--- kernel/trace/trace_entries.h | 66 ++++----------- kernel/trace/trace_events.c | 20 ++++- kernel/trace/trace_events_hist.c | 8 +- kernel/trace/trace_export.c | 106 +++++++++--------------- kernel/trace/trace_kprobe.c | 16 +++- kernel/trace/trace_syscalls.c | 50 +++++------ kernel/trace/trace_uprobe.c | 9 +- net/mac80211/trace.h | 28 +++---- net/wireless/trace.h | 6 +- 21 files changed, 213 insertions(+), 249 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index 343fb9894a82..985ffa9cc958 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -138,10 +138,10 @@ TRACE_EVENT(/* put_tid */ TP_ARGS(dd, index, type, pa, order), TP_STRUCT__entry(/* entry */ DD_DEV_ENTRY(dd) - __field(unsigned long, pa); - __field(u32, index); - __field(u32, type); - __field(u16, order); + __field(unsigned long, pa) + __field(u32, index) + __field(u32, type) + __field(u16, order) ), TP_fast_assign(/* assign */ DD_DEV_ASSIGN(dd); diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index 09eb0c9ada00..769e5e4710c6 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -588,7 +588,7 @@ TRACE_EVENT(hfi1_sdma_user_reqinfo, TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i), TP_ARGS(dd, ctxt, subctxt, i), TP_STRUCT__entry( - DD_DEV_ENTRY(dd); + DD_DEV_ENTRY(dd) __field(u16, ctxt) __field(u8, subctxt) __field(u8, ver_opcode) diff --git a/drivers/lightnvm/pblk-trace.h b/drivers/lightnvm/pblk-trace.h index 9534503b69d9..47b67c6bff7a 100644 --- a/drivers/lightnvm/pblk-trace.h +++ b/drivers/lightnvm/pblk-trace.h @@ -46,7 +46,7 @@ TRACE_EVENT(pblk_chunk_reset, TP_STRUCT__entry( __string(name, name) __field(u64, ppa) - __field(int, state); + __field(int, state) ), TP_fast_assign( @@ -72,7 +72,7 @@ TRACE_EVENT(pblk_chunk_state, TP_STRUCT__entry( __string(name, name) __field(u64, ppa) - __field(int, state); + __field(int, state) ), TP_fast_assign( @@ -98,7 +98,7 @@ TRACE_EVENT(pblk_line_state, TP_STRUCT__entry( __string(name, name) __field(int, line) - __field(int, state); + __field(int, state) ), TP_fast_assign( @@ -121,7 +121,7 @@ TRACE_EVENT(pblk_state, TP_STRUCT__entry( __string(name, name) - __field(int, state); + __field(int, state) ), TP_fast_assign( diff --git a/drivers/net/fjes/fjes_trace.h b/drivers/net/fjes/fjes_trace.h index c611b6a80b20..9237b69d8e21 100644 --- a/drivers/net/fjes/fjes_trace.h +++ b/drivers/net/fjes/fjes_trace.h @@ -28,7 +28,7 @@ TRACE_EVENT(fjes_hw_issue_request_command, __field(u8, cs_busy) __field(u8, cs_complete) __field(int, timeout) - __field(int, ret); + __field(int, ret) ), TP_fast_assign( __entry->cr_req = cr->bits.req_code; diff --git a/drivers/net/wireless/ath/ath10k/trace.h b/drivers/net/wireless/ath/ath10k/trace.h index ab916459d237..842e42ec814f 
100644 --- a/drivers/net/wireless/ath/ath10k/trace.h +++ b/drivers/net/wireless/ath/ath10k/trace.h @@ -239,7 +239,7 @@ TRACE_EVENT(ath10k_wmi_dbglog, TP_STRUCT__entry( __string(device, dev_name(ar->dev)) __string(driver, dev_driver_string(ar->dev)) - __field(u8, hw_type); + __field(u8, hw_type) __field(size_t, buf_len) __dynamic_array(u8, buf, buf_len) ), @@ -269,7 +269,7 @@ TRACE_EVENT(ath10k_htt_pktlog, TP_STRUCT__entry( __string(device, dev_name(ar->dev)) __string(driver, dev_driver_string(ar->dev)) - __field(u8, hw_type); + __field(u8, hw_type) __field(u16, buf_len) __dynamic_array(u8, pktlog, buf_len) ), @@ -435,7 +435,7 @@ TRACE_EVENT(ath10k_htt_rx_desc, TP_STRUCT__entry( __string(device, dev_name(ar->dev)) __string(driver, dev_driver_string(ar->dev)) - __field(u8, hw_type); + __field(u8, hw_type) __field(u16, len) __dynamic_array(u8, rxdesc, len) ), diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3362bae28b46..096203119934 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -329,7 +329,7 @@ TRACE_EVENT(xchk_btree_op_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(int, error) __field(void *, ret_ip) ), @@ -414,7 +414,7 @@ TRACE_EVENT(xchk_btree_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(void *, ret_ip) ), TP_fast_assign( @@ -452,7 +452,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(void *, ret_ip) ), TP_fast_assign( diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index eaae275ed430..53c5485cf2a1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -218,8 +218,8 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(void *, leaf); - __field(int, pos); + __field(void *, leaf) + __field(int, pos) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 30a8cdcfd4a4..a379255c14a9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -187,6 +187,22 @@ enum trace_reg { struct trace_event_call; +#define TRACE_FUNCTION_TYPE ((const char *)~0UL) + +struct trace_event_fields { + const char *type; + union { + struct { + const char *name; + const int size; + const int align; + const int is_signed; + const int filter_type; + }; + int (*define_fields)(struct trace_event_call *); + }; +}; + struct trace_event_class { const char *system; void *probe; @@ -195,7 +211,7 @@ struct trace_event_class { #endif int (*reg)(struct trace_event_call *event, enum trace_reg type, void *data); - int (*define_fields)(struct trace_event_call *); + struct trace_event_fields *fields_array; struct list_head *(*get_fields)(struct trace_event_call *); struct list_head fields; int (*raw_init)(struct trace_event_call *); diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h index ee05db7ee8d2..796053e162d2 100644 --- a/include/trace/events/filemap.h +++ b/include/trace/events/filemap.h @@ -85,7 +85,7 @@ TRACE_EVENT(file_check_and_advance_wb_err, TP_ARGS(file, old), TP_STRUCT__entry( - __field(struct file *, file); + __field(struct file *, file) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(errseq_t, old) diff --git a/include/trace/events/rpcrdma.h 
b/include/trace/events/rpcrdma.h index a13830616107..477351897051 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1507,7 +1507,7 @@ TRACE_EVENT(svcrdma_dma_map_page, TP_ARGS(rdma, page), TP_STRUCT__entry( - __field(const void *, page); + __field(const void *, page) __string(device, rdma->sc_cm_id->device->name) __string(addr, rdma->sc_xprt.xpt_remotebuf) ), diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 4ecdfe2e3580..ca1d2e745a3f 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -394,22 +394,16 @@ static struct trace_event_functions trace_event_type_funcs_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) #undef __field_ext -#define __field_ext(type, item, filter_type) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; +#define __field_ext(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + .is_signed = is_signed_type(_type), .filter_type = _filter_type }, #undef __field_struct_ext -#define __field_struct_ext(type, item, filter_type) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - 0, filter_type); \ - if (ret) \ - return ret; +#define __field_struct_ext(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + 0, .filter_type = _filter_type }, #undef __field #define __field(type, item) __field_ext(type, item, FILTER_OTHER) @@ -418,25 +412,16 @@ static struct trace_event_functions trace_event_type_funcs_##call = { \ #define __field_struct(type, item) __field_struct_ext(type, item, FILTER_OTHER) #undef __array -#define __array(type, item, len) \ - do { \ - char *type_str = #type"["__stringify(len)"]"; \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - BUILD_BUG_ON(len <= 0); \ - ret = trace_define_field(event_call, type_str, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ - if (ret) \ - return ret; \ - } while (0); +#define __array(_type, _item, _len) { \ + .type = #_type"["__stringify(_len)"]", .name = #_item, \ + .size = sizeof(_type[_len]), .align = __alignof__(_type), \ + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }, #undef __dynamic_array -#define __dynamic_array(type, item, len) \ - ret = trace_define_field(event_call, "__data_loc " #type "[]", #item, \ - offsetof(typeof(field), __data_loc_##item), \ - sizeof(field.__data_loc_##item), \ - is_signed_type(type), FILTER_OTHER); +#define __dynamic_array(_type, _item, _len) { \ + .type = "__data_loc " #_type "[]", .name = #_item, \ + .size = 4, .align = 4, \ + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }, #undef __string #define __string(item, src) __dynamic_array(char, item, -1) @@ -446,16 +431,9 @@ static struct trace_event_functions trace_event_type_funcs_##call = { \ #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ -static int notrace __init \ -trace_event_define_fields_##call(struct trace_event_call *event_call) \ -{ \ - struct trace_event_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - return ret; \ -} +static struct trace_event_fields trace_event_fields_##call[] = { \ + tstruct \ + {} }; #undef DEFINE_EVENT #define DEFINE_EVENT(template, 
name, proto, args) @@ -613,7 +591,7 @@ static inline notrace int trace_event_get_offsets_##call( \ * * static struct trace_event_class __used event_class_