Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The changes in here are:

   - text_poke() fixes and an extensive set of executability lockdowns,
     to (hopefully) eliminate the last residual circumstances under
     which we are using W|X mappings even temporarily on x86 kernels.
     This required a broad range of surgery in text patching facilities,
     module loading, trampoline handling and other bits.

   - tweak page fault messages to be more informative and more
     structured.

   - remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the
     default.

   - reduce KASLR granularity on 5-level paging kernels from 512 GB to
     1 GB.

   - misc other changes and updates"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  x86/mm: Initialize PGD cache during mm initialization
  x86/alternatives: Add comment about module removal races
  x86/kprobes: Use vmalloc special flag
  x86/ftrace: Use vmalloc special flag
  bpf: Use vmalloc special flag
  modules: Use vmalloc special flag
  mm/vmalloc: Add flag for freeing of special permsissions
  mm/hibernation: Make hibernation handle unmapped pages
  x86/mm/cpa: Add set_direct_map_*() functions
  x86/alternatives: Remove the return value of text_poke_*()
  x86/jump-label: Remove support for custom text poker
  x86/modules: Avoid breaking W^X while loading modules
  x86/kprobes: Set instruction page as executable
  x86/ftrace: Set trampoline pages as executable
  x86/kgdb: Avoid redundant comparison of patched code
  x86/alternatives: Use temporary mm for text poking
  x86/alternatives: Initialize temporary mm for patching
  fork: Provide a function for copying init_mm
  uprobes: Initialize uprobes earlier
  x86/mm: Save debug registers when loading a temporary mm
  ...
This commit is contained in:
Linus Torvalds
2019-05-06 16:13:31 -07:00
40 changed files with 710 additions and 342 deletions

View File

@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
bpf_jit_binary_unlock_ro(hdr);
bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));

View File

@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {
.priority = INT_MAX-1, /* notified after kprobes, kgdb */
};
static int __init init_uprobes(void)
void __init uprobes_init(void)
{
int i;
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
if (percpu_init_rwsem(&dup_mmap_sem))
return -ENOMEM;
BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
return register_die_notifier(&uprobe_exception_nb);
BUG_ON(register_die_notifier(&uprobe_exception_nb));
}
__initcall(init_uprobes);

View File

@@ -815,6 +815,7 @@ void __init fork_init(void)
#endif
lockdep_init_task(&init_task);
uprobes_init();
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
complete_vfork_done(tsk);
}
/*
* Allocate a new mm structure and copy contents from the
* mm structure of the passed in task structure.
/**
* dup_mm() - duplicates an existing mm structure
* @tsk: the task_struct with which the new mm will be associated.
* @oldmm: the mm to duplicate.
*
* Allocates a new mm structure and duplicates the provided @oldmm structure
* content into it.
*
* Return: the duplicated mm or NULL on failure.
*/
static struct mm_struct *dup_mm(struct task_struct *tsk)
static struct mm_struct *dup_mm(struct task_struct *tsk,
struct mm_struct *oldmm)
{
struct mm_struct *mm, *oldmm = current->mm;
struct mm_struct *mm;
int err;
mm = allocate_mm();
@@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
}
retval = -ENOMEM;
mm = dup_mm(tsk);
mm = dup_mm(tsk, current->mm);
if (!mm)
goto fail_nomem;
@@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu)
return task;
}
/*
 * copy_init_mm() - hand back a duplicate of init_mm.
 *
 * Thin wrapper around dup_mm(): &init_mm is passed as the mm to duplicate
 * and NULL is passed for the task argument. Whatever dup_mm() returns is
 * handed straight back to the caller, so a failed duplication is reported
 * the same way dup_mm() reports it.
 */
struct mm_struct *copy_init_mm(void)
{
return dup_mm(NULL, &init_mm);
}
/*
* Ok, this is the main fork-routine.
*

View File

@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
/* Work queue for freeing init sections in success case */
static struct work_struct init_free_wq;
static struct llist_head init_free_list;
#ifdef CONFIG_MODULES_TREE_LOOKUP
/*
@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
if (!rodata_enabled)
return;
set_vm_flush_reset_perms(mod->core_layout.base);
set_vm_flush_reset_perms(mod->init_layout.base);
frob_text(&mod->core_layout, set_memory_ro);
frob_text(&mod->core_layout, set_memory_x);
frob_rodata(&mod->core_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_x);
frob_rodata(&mod->init_layout, set_memory_ro);
if (after_init)
@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)
frob_writable_data(&mod->init_layout, set_memory_nx);
}
static void module_disable_nx(const struct module *mod)
{
frob_rodata(&mod->core_layout, set_memory_x);
frob_ro_after_init(&mod->core_layout, set_memory_x);
frob_writable_data(&mod->core_layout, set_memory_x);
frob_rodata(&mod->init_layout, set_memory_x);
frob_writable_data(&mod->init_layout, set_memory_x);
}
/* Iterate through all modules and set each module's text as RW */
void set_all_modules_text_rw(void)
{
@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)
}
mutex_unlock(&module_mutex);
}
static void disable_ro_nx(const struct module_layout *layout)
{
if (rodata_enabled) {
frob_text(layout, set_memory_rw);
frob_rodata(layout, set_memory_rw);
frob_ro_after_init(layout, set_memory_rw);
}
frob_rodata(layout, set_memory_x);
frob_ro_after_init(layout, set_memory_x);
frob_writable_data(layout, set_memory_x);
}
#else
static void disable_ro_nx(const struct module_layout *layout) { }
static void module_enable_nx(const struct module *mod) { }
static void module_disable_nx(const struct module *mod) { }
#endif
#ifdef CONFIG_LIVEPATCH
@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)
/*
 * Weak default for releasing a module memory region: emits a warning when
 * invoked from interrupt context, then passes @module_region to vfree().
 */
void __weak module_memfree(void *module_region)
{
/*
* This memory may be RO, and freeing RO memory in an interrupt is not
* supported by vmalloc.
*/
WARN_ON(in_interrupt());
vfree(module_region);
}
@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)
mutex_unlock(&module_mutex);
/* This may be empty, but that's OK */
disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base);
kfree(mod->args);
@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)
lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
/* Finally, free the core (containing the module structure) */
disable_ro_nx(&mod->core_layout);
module_memfree(mod->core_layout.base);
}
@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)
/* For freeing module_init on success, in case kallsyms traversing */
struct mod_initfree {
struct rcu_head rcu;
struct llist_node node;
void *module_init;
};
static void do_free_init(struct rcu_head *head)
static void do_free_init(struct work_struct *w)
{
struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
module_memfree(m->module_init);
kfree(m);
struct llist_node *pos, *n, *list;
struct mod_initfree *initfree;
list = llist_del_all(&init_free_list);
synchronize_rcu();
llist_for_each_safe(pos, n, list) {
initfree = container_of(pos, struct mod_initfree, node);
module_memfree(initfree->module_init);
kfree(initfree);
}
}
/*
 * Prepare the deferred-free machinery for module init sections: bind the
 * init_free_wq work item to do_free_init() and start init_free_list out
 * empty. Always returns 0.
 */
static int __init modules_wq_init(void)
{
INIT_WORK(&init_free_wq, do_free_init);
init_llist_head(&init_free_list);
return 0;
}
/* Registered through module_init() so the setup above runs as an initcall. */
module_init(modules_wq_init);
/*
* This is where the real work happens.
*
@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)
#endif
module_enable_ro(mod, true);
mod_tree_remove_init(mod);
disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
mod->init_layout.base = NULL;
mod->init_layout.size = 0;
@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we
* call synchronize_rcu(), but we don't want to slow down the success
* path, so use actual RCU here.
* path. module_memfree() cannot be called in an interrupt, so do the
* work and call synchronize_rcu() in a work queue.
*
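* Note that module_alloc() on most architectures creates W+X page
* mappings which won't be cleaned up until do_free_init() runs. Any
* code such as mark_rodata_ro() which depends on those mappings to
* be cleaned up needs to sync with the queued work - i.e.
* rcu_barrier(). NOTE(review): do_free_init() is now run via
* schedule_work() rather than call_rcu(); confirm that rcu_barrier()
* really waits for it (flush_work(&init_free_wq) may be required).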
*/
call_rcu(&freeinit->rcu, do_free_init);
if (llist_add(&freeinit->node, &init_free_list))
schedule_work(&init_free_wq);
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
/* we can't deallocate the module until we clear memory protection */
module_disable_ro(mod);
module_disable_nx(mod);
ddebug_cleanup:
ftrace_release_mod(mod);
dynamic_debug_remove(mod, info->debug);

View File

@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)
* safe_copy_page - Copy a page in a safe way.
*
* Check if the page we are going to copy is marked as present in the kernel
* page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
* and in that case kernel_page_present() always returns 'true').
* page tables. This is always the case when neither CONFIG_DEBUG_PAGEALLOC nor
* CONFIG_ARCH_HAS_SET_DIRECT_MAP is set; in that case kernel_page_present()
* always returns 'true'.
*/
static void safe_copy_page(void *dst, struct page *s_page)
{

View File

@@ -14,6 +14,8 @@
#include <linux/syscalls.h>
#include <linux/error-injection.h>
#include <asm/tlb.h>
#include "trace_probe.h"
#include "trace.h"
@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
* access_ok() should prevent writing to non-user memory, but in
* some situations (nommu, temporary switch, etc) access_ok() does
* not provide enough validation, hence the check on KERNEL_DS.
*
* nmi_uaccess_okay() ensures the probe is not run in an interim
* state, when the task or mm are switched. This is specifically
* required to prevent the use of temporary mm.
*/
if (unlikely(in_interrupt() ||
@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
return -EPERM;
if (unlikely(uaccess_kernel()))
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
if (!access_ok(unsafe_ptr, size))
return -EPERM;