mirror of
https://github.com/torvalds/linux.git
synced 2024-11-22 12:11:40 +00:00
mm: implement memory-deny-write-execute as a prctl
Patch series "mm: In-kernel support for memory-deny-write-execute (MDWE)", v2. The background to this is that systemd has a configuration option called MemoryDenyWriteExecute [2], implemented as a SECCOMP BPF filter. Its aim is to prevent a user task from inadvertently creating an executable mapping that is (or was) writeable. Since such a BPF filter is stateless, it cannot detect mappings that were previously writeable but subsequently changed to read-only. Therefore the filter simply rejects any mprotect(PROT_EXEC). The side-effect is that on arm64 with BTI support (Branch Target Identification), the dynamic loader cannot change an ELF section from PROT_EXEC to PROT_EXEC|PROT_BTI using mprotect(). For libraries, it can resort to unmapping and re-mapping but for the main executable it does not have a file descriptor. The original bug report in the Red Hat bugzilla - [3] - and subsequent glibc workaround for libraries - [4]. This series adds in-kernel support for this feature as a prctl PR_SET_MDWE, that is inherited on fork(). The prctl denies PROT_WRITE | PROT_EXEC mappings. Like the systemd BPF filter it also denies adding PROT_EXEC to mappings. However unlike the BPF filter it only denies it if the mapping didn't previously have PROT_EXEC. This allows changing PROT_EXEC -> PROT_EXEC | PROT_BTI with mprotect(), which is a problem with the BPF filter. This patch (of 2): The aim of such policy is to prevent a user task from creating an executable mapping that is also writeable. An example of mmap() returning -EACCES if the policy is enabled: mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0); Similarly, mprotect() would return -EACCES below: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC); The BPF filter that systemd MDWE uses is stateless, and disallows mprotect() with PROT_EXEC completely. 
This new prctl allows PROT_EXEC to be enabled if it was already PROT_EXEC, which allows the following case: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI); where PROT_BTI enables Branch Target Identification on arm64. Link: https://lkml.kernel.org/r/20230119160344.54358-1-joey.gouly@arm.com Link: https://lkml.kernel.org/r/20230119160344.54358-2-joey.gouly@arm.com Signed-off-by: Joey Gouly <joey.gouly@arm.com> Co-developed-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Jeremy Linton <jeremy.linton@arm.com> Cc: Kees Cook <keescook@chromium.org> Cc: Lennart Poettering <lennart@poettering.net> Cc: Mark Brown <broonie@kernel.org> Cc: nd <nd@arm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Szabolcs Nagy <szabolcs.nagy@arm.com> Cc: Topi Miettinen <toiwoton@gmail.com> Cc: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> Cc: David Hildenbrand <david@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent
e6d2c436ff
commit
b507808ebc
@ -156,4 +156,38 @@ calc_vm_flag_bits(unsigned long flags)
|
||||
}
|
||||
|
||||
unsigned long vm_commit_limit(void);
|
||||
|
||||
/*
|
||||
* Denies creating a writable executable mapping or gaining executable permissions.
|
||||
*
|
||||
* This denies the following:
|
||||
*
|
||||
* a) mmap(PROT_WRITE | PROT_EXEC)
|
||||
*
|
||||
* b) mmap(PROT_WRITE)
|
||||
* mprotect(PROT_EXEC)
|
||||
*
|
||||
* c) mmap(PROT_WRITE)
|
||||
* mprotect(PROT_READ)
|
||||
* mprotect(PROT_EXEC)
|
||||
*
|
||||
* But allows the following:
|
||||
*
|
||||
* d) mmap(PROT_READ | PROT_EXEC)
|
||||
* mmap(PROT_READ | PROT_EXEC | PROT_BTI)
|
||||
*/
|
||||
static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
|
||||
{
|
||||
if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags))
|
||||
return false;
|
||||
|
||||
if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
|
||||
return true;
|
||||
|
||||
if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif /* _LINUX_MMAN_H */
|
||||
|
@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm)
|
||||
* lifecycle of this mm, just for simplicity.
|
||||
*/
|
||||
#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
|
||||
|
||||
#define MMF_HAS_MDWE 28
|
||||
#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)
|
||||
|
||||
#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
|
||||
|
||||
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
|
||||
MMF_DISABLE_THP_MASK)
|
||||
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
|
||||
|
||||
#endif /* _LINUX_SCHED_COREDUMP_H */
|
||||
|
@ -281,6 +281,12 @@ struct prctl_mm_map {
|
||||
# define PR_SME_VL_LEN_MASK 0xffff
|
||||
# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */
|
||||
|
||||
/* Memory deny write / execute */
|
||||
#define PR_SET_MDWE 65
|
||||
# define PR_MDWE_REFUSE_EXEC_GAIN 1
|
||||
|
||||
#define PR_GET_MDWE 66
|
||||
|
||||
#define PR_SET_VMA 0x53564d41
|
||||
# define PR_SET_VMA_ANON_NAME 0
|
||||
|
||||
|
33
kernel/sys.c
33
kernel/sys.c
@ -2348,6 +2348,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
|
||||
}
|
||||
#endif /* CONFIG_ANON_VMA_NAME */
|
||||
|
||||
static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
|
||||
unsigned long arg4, unsigned long arg5)
|
||||
{
|
||||
if (arg3 || arg4 || arg5)
|
||||
return -EINVAL;
|
||||
|
||||
if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
|
||||
return -EINVAL;
|
||||
|
||||
if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
|
||||
set_bit(MMF_HAS_MDWE, ¤t->mm->flags);
|
||||
else if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags))
|
||||
return -EPERM; /* Cannot unset the flag */
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
|
||||
unsigned long arg4, unsigned long arg5)
|
||||
{
|
||||
if (arg2 || arg3 || arg4 || arg5)
|
||||
return -EINVAL;
|
||||
|
||||
return test_bit(MMF_HAS_MDWE, ¤t->mm->flags) ?
|
||||
PR_MDWE_REFUSE_EXEC_GAIN : 0;
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
|
||||
unsigned long, arg4, unsigned long, arg5)
|
||||
{
|
||||
@ -2623,6 +2650,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
|
||||
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
|
||||
break;
|
||||
#endif
|
||||
case PR_SET_MDWE:
|
||||
error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
|
||||
break;
|
||||
case PR_GET_MDWE:
|
||||
error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
|
||||
break;
|
||||
case PR_SET_VMA:
|
||||
error = prctl_set_vma(arg2, arg3, arg4, arg5);
|
||||
break;
|
||||
|
10
mm/mmap.c
10
mm/mmap.c
@ -2669,6 +2669,16 @@ cannot_expand:
|
||||
vma_set_anonymous(vma);
|
||||
}
|
||||
|
||||
if (map_deny_write_exec(vma, vma->vm_flags)) {
|
||||
error = -EACCES;
|
||||
if (file)
|
||||
goto close_and_free_vma;
|
||||
else if (vma->vm_file)
|
||||
goto unmap_and_free_vma;
|
||||
else
|
||||
goto free_vma;
|
||||
}
|
||||
|
||||
/* Allow architectures to sanity-check the vm_flags */
|
||||
if (!arch_validate_flags(vma->vm_flags)) {
|
||||
error = -EINVAL;
|
||||
|
@ -799,6 +799,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
|
||||
break;
|
||||
}
|
||||
|
||||
if (map_deny_write_exec(vma, newflags)) {
|
||||
error = -EACCES;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Allow architectures to sanity-check the new flags */
|
||||
if (!arch_validate_flags(newflags)) {
|
||||
error = -EINVAL;
|
||||
|
Loading…
Reference in New Issue
Block a user