Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"The changes in here are:
- text_poke() fixes and an extensive set of executability lockdowns,
to (hopefully) eliminate the last residual circumstances under
which we are using W|X mappings even temporarily on x86 kernels.
This required a broad range of surgery in text patching facilities,
module loading, trampoline handling and other bits.
- tweak page fault messages to be more informative and more
structured.
- remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the
default.
- reduce KASLR granularity on 5-level paging kernels from 512 GB to
1 GB.
- misc other changes and updates"
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
x86/mm: Initialize PGD cache during mm initialization
x86/alternatives: Add comment about module removal races
x86/kprobes: Use vmalloc special flag
x86/ftrace: Use vmalloc special flag
bpf: Use vmalloc special flag
modules: Use vmalloc special flag
  mm/vmalloc: Add flag for freeing of special permissions
mm/hibernation: Make hibernation handle unmapped pages
x86/mm/cpa: Add set_direct_map_*() functions
x86/alternatives: Remove the return value of text_poke_*()
x86/jump-label: Remove support for custom text poker
x86/modules: Avoid breaking W^X while loading modules
x86/kprobes: Set instruction page as executable
x86/ftrace: Set trampoline pages as executable
x86/kgdb: Avoid redundant comparison of patched code
x86/alternatives: Use temporary mm for text poking
x86/alternatives: Initialize temporary mm for patching
fork: Provide a function for copying init_mm
uprobes: Initialize uprobes earlier
x86/mm: Save debug registers when loading a temporary mm
...
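The W^X rule this series enforces is easiest to see in a plain user-space sketch (illustrative only, not kernel code): the mapping starts writable but non-executable, the code is written, and only then are permissions flipped to read+execute, so there is never a window where the page is both writable and executable.

#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* x86-64 machine code for: mov eax, 42; ret */
	static const unsigned char code[] = { 0xb8, 0x2a, 0x00, 0x00, 0x00, 0xc3 };
	void *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memcpy(buf, code, sizeof(code));		/* write while W, not X */
	if (mprotect(buf, 4096, PROT_READ | PROT_EXEC))	/* flip to X, drop W */
		return 1;
	return ((int (*)(void))buf)() == 42 ? 0 : 1;	/* run the stub */
}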
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	if (fp->jited) {
 		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
 
-		bpf_jit_binary_unlock_ro(hdr);
 		bpf_jit_binary_free(hdr);
 
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
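The dropped bpf_jit_binary_unlock_ro() call is unneeded once the JIT image carries the new VM_FLUSH_RESET_PERMS semantics: vfree() itself tears down the RO/X permissions, including the direct-map alias. A minimal sketch of that allocation pattern (the function name alloc_exec_image, the page-aligned size assumption, and the error handling are mine, not code from this series):

#include <linux/moduleloader.h>	/* module_alloc() */
#include <linux/set_memory.h>	/* set_memory_ro(), set_memory_x() */
#include <linux/vmalloc.h>	/* set_vm_flush_reset_perms(), vfree() */

static void *alloc_exec_image(unsigned long size)	/* size page-aligned */
{
	void *mem = module_alloc(size);		/* starts RW, not X */

	if (!mem)
		return NULL;
	/* mark the area so a later vfree() resets permissions itself */
	set_vm_flush_reset_perms(mem);
	/* ... emit code into mem here, while it is still writable ... */
	set_memory_ro((unsigned long)mem, size >> PAGE_SHIFT);
	set_memory_x((unsigned long)mem, size >> PAGE_SHIFT);
	return mem;				/* freeing is plain vfree() */
}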
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {
 	.priority		= INT_MAX-1,	/* notified after kprobes, kgdb */
 };
 
-static int __init init_uprobes(void)
+void __init uprobes_init(void)
 {
 	int i;
 
 	for (i = 0; i < UPROBES_HASH_SZ; i++)
 		mutex_init(&uprobes_mmap_mutex[i]);
 
-	if (percpu_init_rwsem(&dup_mmap_sem))
-		return -ENOMEM;
+	BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
 
-	return register_die_notifier(&uprobe_exception_nb);
+	BUG_ON(register_die_notifier(&uprobe_exception_nb));
 }
-__initcall(init_uprobes);
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -815,6 +815,7 @@ void __init fork_init(void)
 #endif
 
 	lockdep_init_task(&init_task);
+	uprobes_init();
 }
 
 int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 	complete_vfork_done(tsk);
 }
 
-/*
- * Allocate a new mm structure and copy contents from the
- * mm structure of the passed in task structure.
+/**
+ * dup_mm() - duplicates an existing mm structure
+ * @tsk: the task_struct with which the new mm will be associated.
+ * @oldmm: the mm to duplicate.
+ *
+ * Allocates a new mm structure and duplicates the provided @oldmm structure
+ * content into it.
+ *
+ * Return: the duplicated mm or NULL on failure.
  */
-static struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk,
+				struct mm_struct *oldmm)
 {
-	struct mm_struct *mm, *oldmm = current->mm;
+	struct mm_struct *mm;
 	int err;
 
 	mm = allocate_mm();
@@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	}
 
 	retval = -ENOMEM;
-	mm = dup_mm(tsk);
+	mm = dup_mm(tsk, current->mm);
 	if (!mm)
 		goto fail_nomem;
 
@@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu)
 	return task;
 }
 
+struct mm_struct *copy_init_mm(void)
+{
+	return dup_mm(NULL, &init_mm);
+}
+
 /*
  * Ok, this is the main fork-routine.
  *
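copy_init_mm() exists for the text-poking work earlier in this series: the x86 alternatives code clones init_mm to get an mm that is never loaded on another CPU, so the writable alias of the patched page is strictly local. A reduced sketch of that consumer (the real poking_init() in the x86 commits also pre-allocates the PTE it will patch through):

static struct mm_struct *poking_mm;

void __init poking_init(void)
{
	poking_mm = copy_init_mm();
	BUG_ON(!poking_mm);
	/* the real function also reserves a poking address and its PTE */
}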
diff --git a/kernel/module.c b/kernel/module.c
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
 
+/* Work queue for freeing init sections in success case */
+static struct work_struct init_free_wq;
+static struct llist_head init_free_list;
+
 #ifdef CONFIG_MODULES_TREE_LOOKUP
 
 /*
@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
 	if (!rodata_enabled)
 		return;
 
+	set_vm_flush_reset_perms(mod->core_layout.base);
+	set_vm_flush_reset_perms(mod->init_layout.base);
 	frob_text(&mod->core_layout, set_memory_ro);
+	frob_text(&mod->core_layout, set_memory_x);
+
 	frob_rodata(&mod->core_layout, set_memory_ro);
+
 	frob_text(&mod->init_layout, set_memory_ro);
+	frob_text(&mod->init_layout, set_memory_x);
+
 	frob_rodata(&mod->init_layout, set_memory_ro);
 
 	if (after_init)
@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)
 	frob_writable_data(&mod->init_layout, set_memory_nx);
 }
 
-static void module_disable_nx(const struct module *mod)
-{
-	frob_rodata(&mod->core_layout, set_memory_x);
-	frob_ro_after_init(&mod->core_layout, set_memory_x);
-	frob_writable_data(&mod->core_layout, set_memory_x);
-	frob_rodata(&mod->init_layout, set_memory_x);
-	frob_writable_data(&mod->init_layout, set_memory_x);
-}
-
 /* Iterate through all modules and set each module's text as RW */
 void set_all_modules_text_rw(void)
 {
@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)
 	}
 	mutex_unlock(&module_mutex);
 }
 
-static void disable_ro_nx(const struct module_layout *layout)
-{
-	if (rodata_enabled) {
-		frob_text(layout, set_memory_rw);
-		frob_rodata(layout, set_memory_rw);
-		frob_ro_after_init(layout, set_memory_rw);
-	}
-	frob_rodata(layout, set_memory_x);
-	frob_ro_after_init(layout, set_memory_x);
-	frob_writable_data(layout, set_memory_x);
-}
-
 #else
-static void disable_ro_nx(const struct module_layout *layout) { }
 static void module_enable_nx(const struct module *mod) { }
-static void module_disable_nx(const struct module *mod) { }
 #endif
 
 #ifdef CONFIG_LIVEPATCH
@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)
 
 void __weak module_memfree(void *module_region)
 {
+	/*
+	 * This memory may be RO, and freeing RO memory in an interrupt is not
+	 * supported by vmalloc.
+	 */
+	WARN_ON(in_interrupt());
 	vfree(module_region);
 }
 
@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)
 	mutex_unlock(&module_mutex);
 
 	/* This may be empty, but that's OK */
-	disable_ro_nx(&mod->init_layout);
 	module_arch_freeing_init(mod);
 	module_memfree(mod->init_layout.base);
 	kfree(mod->args);
@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)
 	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
 
 	/* Finally, free the core (containing the module structure) */
-	disable_ro_nx(&mod->core_layout);
 	module_memfree(mod->core_layout.base);
 }
 
@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)
 
 /* For freeing module_init on success, in case kallsyms traversing */
 struct mod_initfree {
-	struct rcu_head rcu;
+	struct llist_node node;
 	void *module_init;
 };
 
-static void do_free_init(struct rcu_head *head)
+static void do_free_init(struct work_struct *w)
 {
-	struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
-	module_memfree(m->module_init);
-	kfree(m);
+	struct llist_node *pos, *n, *list;
+	struct mod_initfree *initfree;
+
+	list = llist_del_all(&init_free_list);
+
+	synchronize_rcu();
+
+	llist_for_each_safe(pos, n, list) {
+		initfree = container_of(pos, struct mod_initfree, node);
+		module_memfree(initfree->module_init);
+		kfree(initfree);
+	}
+}
+
+static int __init modules_wq_init(void)
+{
+	INIT_WORK(&init_free_wq, do_free_init);
+	init_llist_head(&init_free_list);
+	return 0;
+}
+module_init(modules_wq_init);
 
 /*
  * This is where the real work happens.
  *
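The new do_free_init() is the consumer half of a lock-free producer/consumer idiom: producers run in contexts where module_memfree() is illegal, so they only push a node and kick a work item; the worker detaches the whole list, lets RCU readers drain, then frees. Reduced to a generic skeleton (all names here are illustrative, not the module loader's):

#include <linux/llist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct deferred_free {
	struct llist_node node;
	void *payload;
};

static LLIST_HEAD(pending);

static void reap(struct work_struct *w)
{
	struct llist_node *pos, *n, *list = llist_del_all(&pending);

	synchronize_rcu();	/* wait out preempt-disabled walkers */
	llist_for_each_safe(pos, n, list) {
		struct deferred_free *d =
			container_of(pos, struct deferred_free, node);

		kfree(d->payload);
		kfree(d);
	}
}
static DECLARE_WORK(reaper, reap);

static void queue_free(struct deferred_free *d)
{
	/* llist_add() returns true only for the first node: one kick */
	if (llist_add(&d->node, &pending))
		schedule_work(&reaper);
}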
@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)
 #endif
 	module_enable_ro(mod, true);
 	mod_tree_remove_init(mod);
-	disable_ro_nx(&mod->init_layout);
 	module_arch_freeing_init(mod);
 	mod->init_layout.base = NULL;
 	mod->init_layout.size = 0;
@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)
 	 * We want to free module_init, but be aware that kallsyms may be
 	 * walking this with preempt disabled.  In all the failure paths, we
 	 * call synchronize_rcu(), but we don't want to slow down the success
-	 * path, so use actual RCU here.
+	 * path. module_memfree() cannot be called in an interrupt, so do the
+	 * work and call synchronize_rcu() in a work queue.
+	 *
+	 * Note that module_alloc() on most architectures creates W+X page
+	 * mappings which won't be cleaned up until do_free_init() runs. Any
+	 * code such as mark_rodata_ro() which depends on those mappings to
+	 * be cleaned up needs to sync with the queued work - ie
+	 * rcu_barrier()
 	 */
-	call_rcu(&freeinit->rcu, do_free_init);
+	if (llist_add(&freeinit->node, &init_free_list))
+		schedule_work(&init_free_wq);
+
 	mutex_unlock(&module_mutex);
 	wake_up_all(&module_wq);
 
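The rcu_barrier() note above is aimed at the W+X checker: mark_rodata_ro() and the debug_checkwx pass must not run while stale module-init mappings are still queued for freeing. The existing sync point in init/main.c looks roughly like this (paraphrased from memory, so treat it as a sketch rather than the exact source):

static void mark_readonly(void)
{
	if (rodata_enabled) {
		/*
		 * load_module() leaves W+X mappings behind until the
		 * deferred free has run; wait for it before checking.
		 */
		rcu_barrier();
		mark_rodata_ro();
		rodata_test();
	} else {
		pr_info("Kernel memory protection disabled.\n");
	}
}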
@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	module_bug_cleanup(mod);
 	mutex_unlock(&module_mutex);
 
-	/* we can't deallocate the module until we clear memory protection */
-	module_disable_ro(mod);
-	module_disable_nx(mod);
-
 ddebug_cleanup:
 	ftrace_release_mod(mod);
 	dynamic_debug_remove(mod, info->debug);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)
  * safe_copy_page - Copy a page in a safe way.
  *
  * Check if the page we are going to copy is marked as present in the kernel
- * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
- * and in that case kernel_page_present() always returns 'true').
+ * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
+ * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
+ * always returns 'true'.
  */
 static void safe_copy_page(void *dst, struct page *s_page)
 {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,8 @@
 #include <linux/syscalls.h>
 #include <linux/error-injection.h>
 
+#include <asm/tlb.h>
+
 #include "trace_probe.h"
 #include "trace.h"
 
@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
 	 * access_ok() should prevent writing to non-user memory, but in
 	 * some situations (nommu, temporary switch, etc) access_ok() does
 	 * not provide enough validation, hence the check on KERNEL_DS.
+	 *
+	 * nmi_uaccess_okay() ensures the probe is not run in an interim
+	 * state, when the task or mm are switched. This is specifically
+	 * required to prevent the use of temporary mm.
 	 */
 
 	if (unlikely(in_interrupt() ||
@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
 		return -EPERM;
 	if (unlikely(uaccess_kernel()))
 		return -EPERM;
+	if (unlikely(!nmi_uaccess_okay()))
+		return -EPERM;
 	if (!access_ok(unsafe_ptr, size))
 		return -EPERM;
 