mirror of
https://github.com/torvalds/linux.git
synced 2024-11-28 07:01:32 +00:00
5c02ece818
Kprobes does something like: register: arch_arm_kprobe() text_poke(INT3) /* guarantees nothing, INT3 will become visible at some point, maybe */ kprobe_optimizer() /* guarantees the bytes after INT3 are unused */ synchronize_rcu_tasks(); text_poke_bp(JMP32); /* implies IPI-sync, kprobe really is enabled */ unregister: __disarm_kprobe() unoptimize_kprobe() text_poke_bp(INT3 + tail); /* implies IPI-sync, so tail is guaranteed visible */ arch_disarm_kprobe() text_poke(old); /* guarantees nothing, old will maybe become visible */ synchronize_rcu() free-stuff Now the problem is that on register, the synchronize_rcu_tasks() does not imply sufficient to guarantee all CPUs have already observed INT3 (although in practice this is exceedingly unlikely not to have happened) (similar to how MEMBARRIER_CMD_PRIVATE_EXPEDITED does not imply MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE). Worse, even if it did, we'd have to do 2 synchronize calls to provide the guarantee we're looking for, the first to ensure INT3 is visible, the second to guarantee nobody is then still using the instruction bytes after INT3. Similar on unregister; the synchronize_rcu() between __unregister_kprobe_top() and __unregister_kprobe_bottom() does not guarantee all CPUs are free of the INT3 (and observe the old text). Therefore, sprinkle some IPI-sync love around. This guarantees that all CPUs agree on the text and RCU once again provides the required guaranteed. Tested-by: Alexei Starovoitov <ast@kernel.org> Tested-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Acked-by: Masami Hiramatsu <mhiramat@kernel.org> Acked-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Paul E. McKenney <paulmck@kernel.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: https://lkml.kernel.org/r/20191111132458.162172862@infradead.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
149 lines
3.9 KiB
C
149 lines
3.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_X86_TEXT_PATCHING_H
|
|
#define _ASM_X86_TEXT_PATCHING_H
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/stddef.h>
|
|
#include <asm/ptrace.h>
|
|
|
|
struct paravirt_patch_site;
|
|
#ifdef CONFIG_PARAVIRT
|
|
void apply_paravirt(struct paravirt_patch_site *start,
|
|
struct paravirt_patch_site *end);
|
|
#else
|
|
static inline void apply_paravirt(struct paravirt_patch_site *start,
|
|
struct paravirt_patch_site *end)
|
|
{}
|
|
#define __parainstructions NULL
|
|
#define __parainstructions_end NULL
|
|
#endif
|
|
|
|
/*
|
|
* Currently, the max observed size in the kernel code is
|
|
* JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
|
|
* Raise it if needed.
|
|
*/
|
|
#define POKE_MAX_OPCODE_SIZE 5
|
|
|
|
extern void text_poke_early(void *addr, const void *opcode, size_t len);
|
|
|
|
/*
|
|
* Clear and restore the kernel write-protection flag on the local CPU.
|
|
* Allows the kernel to edit read-only pages.
|
|
* Side-effect: any interrupt handler running between save and restore will have
|
|
* the ability to write to read-only pages.
|
|
*
|
|
* Warning:
|
|
* Code patching in the UP case is safe if NMIs and MCE handlers are stopped and
|
|
* no thread can be preempted in the instructions being modified (no iret to an
|
|
* invalid instruction possible) or if the instructions are changed from a
|
|
* consistent state to another consistent state atomically.
|
|
* On the local CPU you need to be protected against NMI or MCE handlers seeing
|
|
* an inconsistent instruction while you patch.
|
|
*/
|
|
extern void *text_poke(void *addr, const void *opcode, size_t len);
|
|
extern void text_poke_sync(void);
|
|
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
|
|
extern int poke_int3_handler(struct pt_regs *regs);
|
|
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
|
|
|
|
extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate);
|
|
extern void text_poke_finish(void);
|
|
|
|
#define INT3_INSN_SIZE 1
|
|
#define INT3_INSN_OPCODE 0xCC
|
|
|
|
#define CALL_INSN_SIZE 5
|
|
#define CALL_INSN_OPCODE 0xE8
|
|
|
|
#define JMP32_INSN_SIZE 5
|
|
#define JMP32_INSN_OPCODE 0xE9
|
|
|
|
#define JMP8_INSN_SIZE 2
|
|
#define JMP8_INSN_OPCODE 0xEB
|
|
|
|
#define DISP32_SIZE 4
|
|
|
|
static inline int text_opcode_size(u8 opcode)
|
|
{
|
|
int size = 0;
|
|
|
|
#define __CASE(insn) \
|
|
case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break
|
|
|
|
switch(opcode) {
|
|
__CASE(INT3);
|
|
__CASE(CALL);
|
|
__CASE(JMP32);
|
|
__CASE(JMP8);
|
|
}
|
|
|
|
#undef __CASE
|
|
|
|
return size;
|
|
}
|
|
|
|
union text_poke_insn {
|
|
u8 text[POKE_MAX_OPCODE_SIZE];
|
|
struct {
|
|
u8 opcode;
|
|
s32 disp;
|
|
} __attribute__((packed));
|
|
};
|
|
|
|
static __always_inline
|
|
void *text_gen_insn(u8 opcode, const void *addr, const void *dest)
|
|
{
|
|
static union text_poke_insn insn; /* per instance */
|
|
int size = text_opcode_size(opcode);
|
|
|
|
insn.opcode = opcode;
|
|
|
|
if (size > 1) {
|
|
insn.disp = (long)dest - (long)(addr + size);
|
|
if (size == 2) {
|
|
/*
|
|
* Ensure that for JMP9 the displacement
|
|
* actually fits the signed byte.
|
|
*/
|
|
BUG_ON((insn.disp >> 31) != (insn.disp >> 7));
|
|
}
|
|
}
|
|
|
|
return &insn.text;
|
|
}
|
|
|
|
extern int after_bootmem;
|
|
extern __ro_after_init struct mm_struct *poking_mm;
|
|
extern __ro_after_init unsigned long poking_addr;
|
|
|
|
#ifndef CONFIG_UML_X86
|
|
static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
|
|
{
|
|
regs->ip = ip;
|
|
}
|
|
|
|
static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
|
|
{
|
|
/*
|
|
* The int3 handler in entry_64.S adds a gap between the
|
|
* stack where the break point happened, and the saving of
|
|
* pt_regs. We can extend the original stack because of
|
|
* this gap. See the idtentry macro's create_gap option.
|
|
*
|
|
* Similarly entry_32.S will have a gap on the stack for (any) hardware
|
|
* exception and pt_regs; see FIXUP_FRAME.
|
|
*/
|
|
regs->sp -= sizeof(unsigned long);
|
|
*(unsigned long *)regs->sp = val;
|
|
}
|
|
|
|
static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
|
|
{
|
|
int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
|
|
int3_emulate_jmp(regs, func);
|
|
}
|
|
#endif /* !CONFIG_UML_X86 */
|
|
|
|
#endif /* _ASM_X86_TEXT_PATCHING_H */
|