2019-08-25 09:49:17 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2006-01-08 09:01:31 +00:00
|
|
|
/*
|
2007-10-16 08:26:54 +00:00
|
|
|
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
|
|
|
|
2007-10-16 08:26:54 +00:00
|
|
|
#include <linux/mm.h>
|
2017-02-08 17:51:30 +00:00
|
|
|
#include <linux/sched/signal.h>
|
2007-10-16 08:26:54 +00:00
|
|
|
#include <linux/hardirq.h>
|
2011-08-18 19:14:10 +00:00
|
|
|
#include <linux/module.h>
|
2015-05-19 11:53:29 +00:00
|
|
|
#include <linux/uaccess.h>
|
2017-02-08 17:51:35 +00:00
|
|
|
#include <linux/sched/debug.h>
|
2007-10-16 08:26:54 +00:00
|
|
|
#include <asm/current.h>
|
|
|
|
#include <asm/tlbflush.h>
|
2012-10-08 02:27:32 +00:00
|
|
|
#include <arch.h>
|
|
|
|
#include <as-layout.h>
|
|
|
|
#include <kern_util.h>
|
|
|
|
#include <os.h>
|
|
|
|
#include <skas.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2007-10-16 08:26:54 +00:00
|
|
|
/*
|
2018-10-26 18:02:47 +00:00
|
|
|
* Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by
|
2007-10-16 08:26:54 +00:00
|
|
|
* segv().
|
|
|
|
*/
|
2006-07-10 11:45:13 +00:00
|
|
|
int handle_page_fault(unsigned long address, unsigned long ip,
|
2005-04-16 22:20:36 +00:00
|
|
|
int is_write, int is_user, int *code_out)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *pte;
|
|
|
|
int err = -EFAULT;
|
2020-04-02 04:08:37 +00:00
|
|
|
unsigned int flags = FAULT_FLAG_DEFAULT;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
*code_out = SEGV_MAPERR;
|
2005-09-23 04:44:20 +00:00
|
|
|
|
2007-10-16 08:26:54 +00:00
|
|
|
/*
|
2015-05-11 15:52:11 +00:00
|
|
|
* If the fault was with pagefaults disabled, don't take the fault, just
|
2007-10-16 08:26:54 +00:00
|
|
|
* fail.
|
|
|
|
*/
|
2015-05-11 15:52:11 +00:00
|
|
|
if (faulthandler_disabled())
|
2005-09-23 04:44:20 +00:00
|
|
|
goto out_nosemaphore;
|
|
|
|
|
2013-09-12 22:13:39 +00:00
|
|
|
if (is_user)
|
|
|
|
flags |= FAULT_FLAG_USER;
|
2012-05-31 23:26:03 +00:00
|
|
|
retry:
|
2020-06-09 04:33:25 +00:00
|
|
|
mmap_read_lock(mm);
|
2005-04-16 22:20:36 +00:00
|
|
|
vma = find_vma(mm, address);
|
2007-10-16 08:26:54 +00:00
|
|
|
if (!vma)
|
2005-04-16 22:20:36 +00:00
|
|
|
goto out;
|
2023-06-24 20:45:51 +00:00
|
|
|
if (vma->vm_start <= address)
|
2005-04-16 22:20:36 +00:00
|
|
|
goto good_area;
|
2023-06-24 20:45:51 +00:00
|
|
|
if (!(vma->vm_flags & VM_GROWSDOWN))
|
2005-04-16 22:20:36 +00:00
|
|
|
goto out;
|
2023-06-24 20:45:51 +00:00
|
|
|
if (is_user && !ARCH_IS_STACKGROW(address))
|
2005-04-16 22:20:36 +00:00
|
|
|
goto out;
|
2023-06-24 20:45:51 +00:00
|
|
|
vma = expand_stack(mm, address);
|
|
|
|
if (!vma)
|
|
|
|
goto out_nosemaphore;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2005-09-03 22:57:26 +00:00
|
|
|
good_area:
|
2005-04-16 22:20:36 +00:00
|
|
|
*code_out = SEGV_ACCERR;
|
2013-09-12 22:13:39 +00:00
|
|
|
if (is_write) {
|
|
|
|
if (!(vma->vm_flags & VM_WRITE))
|
|
|
|
goto out;
|
|
|
|
flags |= FAULT_FLAG_WRITE;
|
|
|
|
} else {
|
|
|
|
/* Don't require VM_READ|VM_EXEC for write faults! */
|
|
|
|
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
|
|
|
|
goto out;
|
|
|
|
}
|
2005-05-20 20:59:08 +00:00
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
do {
|
2018-08-17 22:44:47 +00:00
|
|
|
vm_fault_t fault;
|
2009-01-06 22:38:59 +00:00
|
|
|
|
mm: do page fault accounting in handle_mm_fault
Patch series "mm: Page fault accounting cleanups", v5.
This is v5 of the pf accounting cleanup series. It originates from Gerald
Schaefer's report on an issue a week ago regarding to incorrect page fault
accountings for retried page fault after commit 4064b9827063 ("mm: allow
VM_FAULT_RETRY for multiple times"):
https://lore.kernel.org/lkml/20200610174811.44b94525@thinkpad/
What this series did:
- Correct page fault accounting: we do accounting for a page fault
(no matter whether it's from #PF handling, or gup, or anything else)
only with the one that completed the fault. For example, page fault
retries should not be counted in page fault counters. Same to the
perf events.
- Unify definition of PERF_COUNT_SW_PAGE_FAULTS: currently this perf
event is used in an adhoc way across different archs.
Case (1): for many archs it's done at the entry of a page fault
handler, so that it will also cover e.g. errornous faults.
Case (2): for some other archs, it is only accounted when the page
fault is resolved successfully.
Case (3): there're still quite some archs that have not enabled
this perf event.
Since this series will touch merely all the archs, we unify this
perf event to always follow case (1), which is the one that makes most
sense. And since we moved the accounting into handle_mm_fault, the
other two MAJ/MIN perf events are well taken care of naturally.
- Unify definition of "major faults": the definition of "major
fault" is slightly changed when used in accounting (not
VM_FAULT_MAJOR). More information in patch 1.
- Always account the page fault onto the one that triggered the page
fault. This does not matter much for #PF handlings, but mostly for
gup. More information on this in patch 25.
Patchset layout:
Patch 1: Introduced the accounting in handle_mm_fault(), not enabled.
Patch 2-23: Enable the new accounting for arch #PF handlers one by one.
Patch 24: Enable the new accounting for the rest outliers (gup, iommu, etc.)
Patch 25: Cleanup GUP task_struct pointer since it's not needed any more
This patch (of 25):
This is a preparation patch to move page fault accountings into the
general code in handle_mm_fault(). This includes both the per task
flt_maj/flt_min counters, and the major/minor page fault perf events. To
do this, the pt_regs pointer is passed into handle_mm_fault().
PERF_COUNT_SW_PAGE_FAULTS should still be kept in per-arch page fault
handlers.
So far, all the pt_regs pointer that passed into handle_mm_fault() is
NULL, which means this patch should have no intented functional change.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Cain <bcain@codeaurora.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Ley Foon Tan <ley.foon.tan@intel.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Nick Hu <nickhu@andestech.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vincent Chen <deanbo422@gmail.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Link: http://lkml.kernel.org/r/20200707225021.200906-1-peterx@redhat.com
Link: http://lkml.kernel.org/r/20200707225021.200906-2-peterx@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-12 01:37:44 +00:00
|
|
|
fault = handle_mm_fault(vma, address, flags, NULL);
|
2012-05-31 23:26:03 +00:00
|
|
|
|
|
|
|
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
|
|
|
|
goto out_nosemaphore;
|
|
|
|
|
mm: avoid unnecessary page fault retires on shared memory types
I observed that for each of the shared file-backed page faults, we're very
likely to retry one more time for the 1st write fault upon no page. It's
because we'll need to release the mmap lock for dirty rate limit purpose
with balance_dirty_pages_ratelimited() (in fault_dirty_shared_page()).
Then after that throttling we return VM_FAULT_RETRY.
We did that probably because VM_FAULT_RETRY is the only way we can return
to the fault handler at that time telling it we've released the mmap lock.
However that's not ideal because it's very likely the fault does not need
to be retried at all since the pgtable was well installed before the
throttling, so the next continuous fault (including taking mmap read lock,
walk the pgtable, etc.) could be in most cases unnecessary.
It's not only slowing down page faults for shared file-backed, but also add
more mmap lock contention which is in most cases not needed at all.
To observe this, one could try to write to some shmem page and look at
"pgfault" value in /proc/vmstat, then we should expect 2 counts for each
shmem write simply because we retried, and vm event "pgfault" will capture
that.
To make it more efficient, add a new VM_FAULT_COMPLETED return code just to
show that we've completed the whole fault and released the lock. It's also
a hint that we should very possibly not need another fault immediately on
this page because we've just completed it.
This patch provides a ~12% perf boost on my aarch64 test VM with a simple
program sequentially dirtying 400MB shmem file being mmap()ed and these are
the time it needs:
Before: 650.980 ms (+-1.94%)
After: 569.396 ms (+-1.38%)
I believe it could help more than that.
We need some special care on GUP and the s390 pgfault handler (for gmap
code before returning from pgfault), the rest changes in the page fault
handlers should be relatively straightforward.
Another thing to mention is that mm_account_fault() does take this new
fault as a generic fault to be accounted, unlike VM_FAULT_RETRY.
I explicitly didn't touch hmm_vma_fault() and break_ksm() because they do
not handle VM_FAULT_RETRY even with existing code, so I'm literally keeping
them as-is.
Link: https://lkml.kernel.org/r/20220530183450.42886-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vineet Gupta <vgupta@kernel.org>
Acked-by: Guo Ren <guoren@kernel.org>
Acked-by: Max Filippov <jcmvbkbc@gmail.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk> [arm part]
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Stafford Horne <shorne@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Richard Weinberger <richard@nod.at>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Will Deacon <will@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Chris Zankel <chris@zankel.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Rich Felker <dalias@libc.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Helge Deller <deller@gmx.de>
Cc: Yoshinori Sato <ysato@users.osdn.me>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-05-30 18:34:50 +00:00
|
|
|
/* The fault is fully completed (including releasing mmap lock) */
|
|
|
|
if (fault & VM_FAULT_COMPLETED)
|
|
|
|
return 0;
|
|
|
|
|
2007-07-19 08:47:05 +00:00
|
|
|
if (unlikely(fault & VM_FAULT_ERROR)) {
|
|
|
|
if (fault & VM_FAULT_OOM) {
|
|
|
|
goto out_of_memory;
|
vm: add VM_FAULT_SIGSEGV handling support
The core VM already knows about VM_FAULT_SIGBUS, but cannot return a
"you should SIGSEGV" error, because the SIGSEGV case was generally
handled by the caller - usually the architecture fault handler.
That results in lots of duplication - all the architecture fault
handlers end up doing very similar "look up vma, check permissions, do
retries etc" - but it generally works. However, there are cases where
the VM actually wants to SIGSEGV, and applications _expect_ SIGSEGV.
In particular, when accessing the stack guard page, libsigsegv expects a
SIGSEGV. And it usually got one, because the stack growth is handled by
that duplicated architecture fault handler.
However, when the generic VM layer started propagating the error return
from the stack expansion in commit fee7e49d4514 ("mm: propagate error
from stack expansion even for guard page"), that now exposed the
existing VM_FAULT_SIGBUS result to user space. And user space really
expected SIGSEGV, not SIGBUS.
To fix that case, we need to add a VM_FAULT_SIGSEGV, and teach all those
duplicate architecture fault handlers about it. They all already have
the code to handle SIGSEGV, so it's about just tying that new return
value to the existing code, but it's all a bit annoying.
This is the mindless minimal patch to do this. A more extensive patch
would be to try to gather up the mostly shared fault handling logic into
one generic helper routine, and long-term we really should do that
cleanup.
Just from this patch, you can generally see that most architectures just
copied (directly or indirectly) the old x86 way of doing things, but in
the meantime that original x86 model has been improved to hold the VM
semaphore for shorter times etc and to handle VM_FAULT_RETRY and other
"newer" things, so it would be a good idea to bring all those
improvements to the generic case and teach other architectures about
them too.
Reported-and-tested-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Jan Engelhardt <jengelh@inai.de>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com> # "s390 still compiles and boots"
Cc: linux-arch@vger.kernel.org
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-01-29 18:51:32 +00:00
|
|
|
} else if (fault & VM_FAULT_SIGSEGV) {
|
|
|
|
goto out;
|
2007-07-19 08:47:05 +00:00
|
|
|
} else if (fault & VM_FAULT_SIGBUS) {
|
|
|
|
err = -EACCES;
|
|
|
|
goto out;
|
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
BUG();
|
|
|
|
}
|
2022-01-14 22:05:51 +00:00
|
|
|
if (fault & VM_FAULT_RETRY) {
|
|
|
|
flags |= FAULT_FLAG_TRIED;
|
2012-05-31 23:26:03 +00:00
|
|
|
|
2022-01-14 22:05:51 +00:00
|
|
|
goto retry;
|
2012-05-31 23:26:03 +00:00
|
|
|
}
|
2007-07-19 08:47:05 +00:00
|
|
|
|
2020-06-09 04:33:05 +00:00
|
|
|
pmd = pmd_off(mm, address);
|
2005-09-03 22:57:26 +00:00
|
|
|
pte = pte_offset_kernel(pmd, address);
|
2007-10-16 08:26:54 +00:00
|
|
|
} while (!pte_present(*pte));
|
2005-04-16 22:20:36 +00:00
|
|
|
err = 0;
|
2007-10-16 08:26:54 +00:00
|
|
|
/*
|
|
|
|
* The below warning was added in place of
|
[PATCH] uml: remove bogus WARN_ON, triggerable harmlessly on a page fault race
The below warning was added in place of pte_mkyoung(); if (is_write)
pte_mkdirty();
In fact, if the PTE is not marked young/dirty, our dirty/accessed bit
emulation would cause the TLB permission not to be changed, and so we'd loop,
and given we don't support preemption yet, we'd busy-hang here.
However, I've seen this warning trigger without crashes during a loop of
concurrent kernel builds, at random times (i.e. like a race condition), and I
realized that two concurrent faults on the same page, one on read and one on
write, can trigger it. The read fault gets serviced and the PTE gets marked
writable but clean (it's possible on a shared-writable mapping), while the
generic code sees the PTE was already installed and returns without action. In
this case, we'll see another fault and service it normally.
Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Acked-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-14 00:07:04 +00:00
|
|
|
* pte_mkyoung(); if (is_write) pte_mkdirty();
|
|
|
|
* If it's triggered, we'd see normally a hang here (a clean pte is
|
|
|
|
* marked read-only to emulate the dirty bit).
|
|
|
|
* However, the generic code can mark a PTE writable but clean on a
|
|
|
|
* concurrent read fault, triggering this harmlessly. So comment it out.
|
|
|
|
*/
|
|
|
|
#if 0
|
2005-09-10 17:44:58 +00:00
|
|
|
WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte)));
|
[PATCH] uml: remove bogus WARN_ON, triggerable harmlessly on a page fault race
The below warning was added in place of pte_mkyoung(); if (is_write)
pte_mkdirty();
In fact, if the PTE is not marked young/dirty, our dirty/accessed bit
emulation would cause the TLB permission not to be changed, and so we'd loop,
and given we don't support preemption yet, we'd busy-hang here.
However, I've seen this warning trigger without crashes during a loop of
concurrent kernel builds, at random times (i.e. like a race condition), and I
realized that two concurrent faults on the same page, one on read and one on
write, can trigger it. The read fault gets serviced and the PTE gets marked
writable but clean (it's possible on a shared-writable mapping), while the
generic code sees the PTE was already installed and returns without action. In
this case, we'll see another fault and service it normally.
Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Acked-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-14 00:07:04 +00:00
|
|
|
#endif
|
2024-07-03 13:45:36 +00:00
|
|
|
|
2005-09-03 22:57:26 +00:00
|
|
|
out:
|
2020-06-09 04:33:25 +00:00
|
|
|
mmap_read_unlock(mm);
|
2005-09-23 04:44:20 +00:00
|
|
|
out_nosemaphore:
|
2007-10-16 08:26:54 +00:00
|
|
|
return err;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
out_of_memory:
|
2009-01-06 22:38:59 +00:00
|
|
|
/*
|
|
|
|
* We ran out of memory, call the OOM killer, and return the userspace
|
|
|
|
* (which will retry the fault, or kill us if we got oom-killed).
|
|
|
|
*/
|
2020-06-09 04:33:25 +00:00
|
|
|
mmap_read_unlock(mm);
|
2013-09-12 22:13:38 +00:00
|
|
|
if (!is_user)
|
|
|
|
goto out_nosemaphore;
|
2009-01-06 22:38:59 +00:00
|
|
|
pagefault_out_of_memory();
|
|
|
|
return 0;
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2011-05-25 00:13:03 +00:00
|
|
|
static void show_segv_info(struct uml_pt_regs *regs)
|
|
|
|
{
|
|
|
|
struct task_struct *tsk = current;
|
|
|
|
struct faultinfo *fi = UPT_FAULTINFO(regs);
|
|
|
|
|
|
|
|
if (!unhandled_signal(tsk, SIGSEGV))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!printk_ratelimit())
|
|
|
|
return;
|
|
|
|
|
2017-12-19 21:52:23 +00:00
|
|
|
printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
|
2011-05-25 00:13:03 +00:00
|
|
|
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
|
|
|
|
tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
|
|
|
|
(void *)UPT_IP(regs), (void *)UPT_SP(regs),
|
|
|
|
fi->error_code);
|
|
|
|
|
|
|
|
print_vma_addr(KERN_CONT " in ", UPT_IP(regs));
|
|
|
|
printk(KERN_CONT "\n");
|
|
|
|
}
|
|
|
|
|
2007-02-10 09:44:14 +00:00
|
|
|
static void bad_segv(struct faultinfo fi, unsigned long ip)
|
|
|
|
{
|
|
|
|
current->thread.arch.faultinfo = fi;
|
2019-05-23 16:04:24 +00:00
|
|
|
force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi));
|
2007-02-10 09:44:14 +00:00
|
|
|
}
|
|
|
|
|
2008-02-05 06:30:58 +00:00
|
|
|
void fatal_sigsegv(void)
|
|
|
|
{
|
2021-10-25 15:50:57 +00:00
|
|
|
force_fatal_sig(SIGSEGV);
|
2015-07-03 19:44:20 +00:00
|
|
|
do_signal(¤t->thread.regs);
|
2008-02-05 06:30:58 +00:00
|
|
|
/*
|
|
|
|
* This is to tell gcc that we're not returning - do_signal
|
|
|
|
* can, in general, return, but in this case, it's not, since
|
|
|
|
* we just got a fatal SIGSEGV queued.
|
|
|
|
*/
|
|
|
|
os_dump_core();
|
|
|
|
}
|
|
|
|
|
2017-07-05 22:34:04 +00:00
|
|
|
/**
|
|
|
|
* segv_handler() - the SIGSEGV handler
|
|
|
|
* @sig: the signal number
|
|
|
|
* @unused_si: the signal info struct; unused in this handler
|
|
|
|
* @regs: the ptrace register information
|
|
|
|
*
|
|
|
|
* The handler first extracts the faultinfo from the UML ptrace regs struct.
|
|
|
|
* If the userfault did not happen in an UML userspace process, bad_segv is called.
|
|
|
|
* Otherwise the signal did happen in a cloned userspace process, handle it.
|
|
|
|
*/
|
2012-08-01 22:49:17 +00:00
|
|
|
void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
|
2006-01-08 09:01:32 +00:00
|
|
|
{
|
|
|
|
struct faultinfo * fi = UPT_FAULTINFO(regs);
|
|
|
|
|
2007-10-16 08:26:54 +00:00
|
|
|
if (UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)) {
|
2011-05-25 00:13:03 +00:00
|
|
|
show_segv_info(regs);
|
2006-01-08 09:01:32 +00:00
|
|
|
bad_segv(*fi, UPT_IP(regs));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs);
|
|
|
|
}
|
|
|
|
|
[PATCH] uml: S390 preparation, abstract host page fault data
This patch removes the arch-specific fault/trap-infos from thread and
skas-regs.
It adds a new struct faultinfo, that is arch-specific defined in
sysdep/faultinfo.h.
The structure is inserted in thread.arch and thread.regs.skas and
thread.regs.tt
Now, segv and other trap-handlers can copy the contents from regs.X.faultinfo
to thread.arch.faultinfo with one simple assignment.
Also, the number of macros necessary is reduced to
FAULT_ADDRESS(struct faultinfo)
extracts the faulting address from faultinfo
FAULT_WRITE(struct faultinfo)
extracts the "is_write" flag
SEGV_IS_FIXABLE(struct faultinfo)
is true for the fixable segvs, i.e. (TRAP == 14)
on i386
UPT_FAULTINFO(regs)
result is (struct faultinfo *) to the faultinfo
in regs->skas.faultinfo
GET_FAULTINFO_FROM_SC(struct faultinfo, struct sigcontext *)
copies the relevant parts of the sigcontext to
struct faultinfo.
On SIGSEGV, call user_signal() instead of handle_segv(), if the architecture
provides the information needed in PTRACE_FAULTINFO, or if PTRACE_FAULTINFO is
missing, because segv-stub will provide the info.
The benefit of the change is, that in case of a non-fixable SIGSEGV, we can
give user processes a SIGSEGV, instead of possibly looping on pagefault
handling.
Since handle_segv() skipped arch_fixup() implicitly by passing ip==0 to segv(),
I changed segv() to call arch_fixup() only, if !is_user.
Signed-off-by: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-05 23:15:31 +00:00
|
|
|
/*
|
|
|
|
* We give a *copy* of the faultinfo in the regs to segv.
|
|
|
|
* This must be done, since nesting SEGVs could overwrite
|
|
|
|
* the info in the regs. A pointer to the info then would
|
|
|
|
* give us bad data!
|
|
|
|
*/
|
2007-05-06 21:51:24 +00:00
|
|
|
unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
|
2007-10-16 08:26:58 +00:00
|
|
|
struct uml_pt_regs *regs)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2007-10-16 08:27:05 +00:00
|
|
|
jmp_buf *catcher;
|
2018-04-16 00:50:48 +00:00
|
|
|
int si_code;
|
2005-04-16 22:20:36 +00:00
|
|
|
int err;
|
2007-05-06 21:51:24 +00:00
|
|
|
int is_write = FAULT_WRITE(fi);
|
|
|
|
unsigned long address = FAULT_ADDRESS(fi);
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2014-07-20 11:39:27 +00:00
|
|
|
if (!is_user && regs)
|
2013-09-23 15:38:02 +00:00
|
|
|
current->thread.segv_regs = container_of(regs, struct pt_regs, regs);
|
|
|
|
|
2024-07-03 13:45:36 +00:00
|
|
|
if (!is_user && init_mm.context.sync_tlb_range_to) {
|
|
|
|
/*
|
|
|
|
* Kernel has pending updates from set_ptes that were not
|
|
|
|
* flushed yet. Syncing them should fix the pagefault (if not
|
|
|
|
* we'll get here again and panic).
|
|
|
|
*/
|
|
|
|
err = um_tlb_sync(&init_mm);
|
|
|
|
if (err == -ENOMEM)
|
|
|
|
report_enomem();
|
|
|
|
if (err)
|
|
|
|
panic("Failed to sync kernel TLBs: %d", err);
|
2013-09-23 15:38:02 +00:00
|
|
|
goto out;
|
2007-05-06 21:51:24 +00:00
|
|
|
}
|
2007-10-16 08:26:54 +00:00
|
|
|
else if (current->mm == NULL) {
|
2007-05-06 21:51:25 +00:00
|
|
|
show_regs(container_of(regs, struct pt_regs, regs));
|
2007-10-16 08:26:54 +00:00
|
|
|
panic("Segfault with no mm");
|
2007-05-06 21:51:25 +00:00
|
|
|
}
|
2015-08-09 20:26:33 +00:00
|
|
|
else if (!is_user && address > PAGE_SIZE && address < TASK_SIZE) {
|
2015-05-31 17:21:51 +00:00
|
|
|
show_regs(container_of(regs, struct pt_regs, regs));
|
|
|
|
panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx",
|
|
|
|
address, ip);
|
|
|
|
}
|
2005-09-23 04:44:16 +00:00
|
|
|
|
2015-03-18 20:31:27 +00:00
|
|
|
if (SEGV_IS_FIXABLE(&fi))
|
2007-10-16 08:26:54 +00:00
|
|
|
err = handle_page_fault(address, ip, is_write, is_user,
|
2018-04-16 00:50:48 +00:00
|
|
|
&si_code);
|
2005-09-23 04:44:16 +00:00
|
|
|
else {
|
|
|
|
err = -EFAULT;
|
2007-10-16 08:26:54 +00:00
|
|
|
/*
|
|
|
|
* A thread accessed NULL, we get a fault, but CR2 is invalid.
|
|
|
|
* This code is used in __do_copy_from_user() of TT mode.
|
|
|
|
* XXX tt mode is gone, so maybe this isn't needed any more
|
|
|
|
*/
|
2005-09-23 04:44:16 +00:00
|
|
|
address = 0;
|
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
catcher = current->thread.fault_catcher;
|
2007-10-16 08:26:54 +00:00
|
|
|
if (!err)
|
2013-09-23 15:38:02 +00:00
|
|
|
goto out;
|
2007-10-16 08:26:54 +00:00
|
|
|
else if (catcher != NULL) {
|
2005-04-16 22:20:36 +00:00
|
|
|
current->thread.fault_addr = (void *) address;
|
2007-10-16 08:27:05 +00:00
|
|
|
UML_LONGJMP(catcher, 1);
|
2006-07-10 11:45:13 +00:00
|
|
|
}
|
2007-10-16 08:26:54 +00:00
|
|
|
else if (current->thread.fault_addr != NULL)
|
2005-04-16 22:20:36 +00:00
|
|
|
panic("fault_addr set but no fault catcher");
|
2007-10-16 08:26:54 +00:00
|
|
|
else if (!is_user && arch_fixup(ip, regs))
|
2013-09-23 15:38:02 +00:00
|
|
|
goto out;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2007-10-16 08:26:54 +00:00
|
|
|
if (!is_user) {
|
2007-05-06 21:51:25 +00:00
|
|
|
show_regs(container_of(regs, struct pt_regs, regs));
|
2006-07-10 11:45:13 +00:00
|
|
|
panic("Kernel mode fault at addr 0x%lx, ip 0x%lx",
|
2005-04-16 22:20:36 +00:00
|
|
|
address, ip);
|
2007-05-06 21:51:25 +00:00
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2011-05-25 00:13:03 +00:00
|
|
|
show_segv_info(regs);
|
|
|
|
|
2005-09-03 22:57:26 +00:00
|
|
|
if (err == -EACCES) {
|
2007-05-06 21:51:24 +00:00
|
|
|
current->thread.arch.faultinfo = fi;
|
2019-05-23 16:04:24 +00:00
|
|
|
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
|
2005-09-03 22:57:26 +00:00
|
|
|
} else {
|
|
|
|
BUG_ON(err != -EFAULT);
|
2007-05-06 21:51:24 +00:00
|
|
|
current->thread.arch.faultinfo = fi;
|
2019-05-23 16:04:24 +00:00
|
|
|
force_sig_fault(SIGSEGV, si_code, (void __user *) address);
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
2013-09-23 15:38:02 +00:00
|
|
|
|
|
|
|
out:
|
|
|
|
if (regs)
|
|
|
|
current->thread.segv_regs = NULL;
|
|
|
|
|
2007-05-06 21:51:24 +00:00
|
|
|
return 0;
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2012-08-01 22:49:17 +00:00
|
|
|
void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2018-04-16 21:12:31 +00:00
|
|
|
int code, err;
|
2007-10-16 08:26:54 +00:00
|
|
|
if (!UPT_IS_USER(regs)) {
|
|
|
|
if (sig == SIGBUS)
|
|
|
|
printk(KERN_ERR "Bus error - the host /dev/shm or /tmp "
|
|
|
|
"mount likely just ran out of space\n");
|
2005-04-16 22:20:36 +00:00
|
|
|
panic("Kernel mode signal %d", sig);
|
2006-09-26 06:33:03 +00:00
|
|
|
}
|
|
|
|
|
uml: further bugs.c tidying
bugs.c, for both i386 and x86_64, can undergo further cleaning -
The i386 arch_check_bugs only does one thing, so we might as
well inline the cmov checking.
The i386 includes can be trimmed down a bit.
arch_init_thread wasn't used, so it is deleted.
The panics in arch_handle_signal are turned into printks
because the process is about to get segfaulted anyway, so something is
dying no matter what happens here. Also, the return value was always
the same, so it contained no information, so it can be void instead.
The name is changed to arch_examine_signal because it doesn't handle
anything.
The caller of arch_handle_signal, relay_signal, does things in
a different order. The kernel-mode signal check is now first, which
puts everything else together, making things a bit clearer conceptually.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 06:30:40 +00:00
|
|
|
arch_examine_signal(sig, regs);
|
|
|
|
|
2018-04-16 21:12:31 +00:00
|
|
|
/* Is the signal layout for the signal known?
|
|
|
|
* Signal data must be scrubbed to prevent information leaks.
|
|
|
|
*/
|
|
|
|
code = si->si_code;
|
|
|
|
err = si->si_errno;
|
|
|
|
if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) {
|
|
|
|
struct faultinfo *fi = UPT_FAULTINFO(regs);
|
2012-08-01 22:49:17 +00:00
|
|
|
current->thread.arch.faultinfo = *fi;
|
2019-05-23 16:04:24 +00:00
|
|
|
force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi));
|
2018-04-16 21:12:31 +00:00
|
|
|
} else {
|
|
|
|
printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n",
|
|
|
|
sig, code, err);
|
2019-05-23 15:17:27 +00:00
|
|
|
force_sig(sig);
|
2012-08-01 22:49:17 +00:00
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2012-08-01 22:49:17 +00:00
|
|
|
void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2007-10-16 08:26:54 +00:00
|
|
|
if (current->thread.fault_catcher != NULL)
|
2007-10-16 08:27:05 +00:00
|
|
|
UML_LONGJMP(current->thread.fault_catcher, 1);
|
2012-08-01 22:49:17 +00:00
|
|
|
else
|
|
|
|
relay_signal(sig, si, regs);
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2012-08-01 22:49:17 +00:00
|
|
|
void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
|
|
|
do_IRQ(WINCH_IRQ, regs);
|
|
|
|
}
|