// SPDX-License-Identifier: GPL-2.0-only
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/perf_event.h>

#include "mm_internal.h"

#ifdef CONFIG_PARAVIRT
# define STATIC_NOPV
#else
# define STATIC_NOPV                    static
# define __flush_tlb_local              native_flush_tlb_local
# define __flush_tlb_global             native_flush_tlb_global
# define __flush_tlb_one_user(addr)     native_flush_tlb_one_user(addr)
# define __flush_tlb_multi(msk, info)   native_flush_tlb_multi(msk, info)
#endif

/*
 *      TLB flushing, formerly SMP-only
 *              c/o Linus Torvalds.
 *
 *      These mean you can really definitely utterly forget about
 *      writing to user space from interrupts. (It's not allowed anyway).
 *
 *      Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *      More scalable flush, from Andi Kleen
 *
 *      Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

/*
 * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is
 * stored in cpu_tlbstate.last_user_mm_spec.
 */
#define LAST_USER_MM_IBPB       0x1UL
#define LAST_USER_MM_SPEC_MASK  (LAST_USER_MM_IBPB)

/* Bits to set when tlbstate and flush are (re)initialized */
#define LAST_USER_MM_INIT       LAST_USER_MM_IBPB

/*
 * The x86 feature is called PCID (Process Context IDentifier). It is similar
 * to what is traditionally called ASID on the RISC processors.
 *
 * We don't use the traditional ASID implementation, where each process/mm gets
 * its own ASID and flush/restart when we run out of ASID space.
 *
 * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
 * that came by on this CPU, allowing cheaper switch_mm between processes on
 * this CPU.
 *
 * We end up with different spaces for different things. To avoid confusion we
 * use different names for each of them:
 *
 * ASID  - [0, TLB_NR_DYN_ASIDS-1]
 *         the canonical identifier for an mm
 *
 * kPCID - [1, TLB_NR_DYN_ASIDS]
 *         the value we write into the PCID part of CR3; corresponds to the
 *         ASID+1, because PCID 0 is special.
 *
 * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
 *         for KPTI each mm has two address spaces and thus needs two
 *         PCID values, but we can still do with a single ASID denomination
 *         for each mm. Corresponds to kPCID + 2048.
 */

/* There are 12 bits of space for ASIDS in CR3 */
#define CR3_HW_ASID_BITS                12

/*
 * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
 * user/kernel switches
 */
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS         1
#else
# define PTI_CONSUMED_PCID_BITS         0
#endif

#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)

/*
 * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
 * for them being zero-based. Another -1 is because PCID 0 is reserved for
 * use by non-PCID-aware users.
 */
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
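
/*
 * Worked example (illustrative, assuming X86_CR3_PCID_BITS == 12 as per the
 * CR3_HW_ASID_BITS comment above): with PAGE_TABLE_ISOLATION enabled,
 * CR3_AVAIL_PCID_BITS == 12 - 1 == 11 and MAX_ASID_AVAILABLE == (1 << 11) - 2
 * == 2046; without it, MAX_ASID_AVAILABLE == (1 << 12) - 2 == 4094.  Only
 * TLB_NR_DYN_ASIDS of those values are actually handed out per CPU.
 */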

/*
 * Given @asid, compute kPCID
 */
static inline u16 kern_pcid(u16 asid)
{
        VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);

#ifdef CONFIG_PAGE_TABLE_ISOLATION
        /*
         * Make sure that the dynamic ASID space does not conflict with the
         * bit we are using to switch between user and kernel ASIDs.
         */
        BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));

        /*
         * The ASID being passed in here should have respected the
         * MAX_ASID_AVAILABLE and thus never have the switch bit set.
         */
        VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
#endif
        /*
         * The dynamically-assigned ASIDs that get passed in are small
         * (<TLB_NR_DYN_ASIDS).  They never have the high switch bit set,
         * so do not bother to clear it.
         *
         * If PCID is on, ASID-aware code paths put the ASID+1 into the
         * PCID bits.  This serves two purposes.  It prevents a nasty
         * situation in which PCID-unaware code saves CR3, loads some other
         * value (with PCID == 0), and then restores CR3, thus corrupting
         * the TLB for ASID 0 if the saved ASID was nonzero.  It also means
         * that any bugs involving loading a PCID-enabled CR3 with
         * CR4.PCIDE off will trigger deterministically.
         */
        return asid + 1;
}

/*
 * Given @asid, compute uPCID
 */
static inline u16 user_pcid(u16 asid)
{
        u16 ret = kern_pcid(asid);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
        ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
#endif
        return ret;
}
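
/*
 * Illustration of the mapping above: ASID 0 becomes kPCID 1, and with
 * PAGE_TABLE_ISOLATION it additionally becomes uPCID
 * 1 | (1 << X86_CR3_PTI_PCID_USER_BIT), i.e. kPCID + 2048 as described in
 * the naming comment near the top of this file.
 */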

static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
{
        if (static_cpu_has(X86_FEATURE_PCID)) {
                return __sme_pa(pgd) | kern_pcid(asid);
        } else {
                VM_WARN_ON_ONCE(asid != 0);
                return __sme_pa(pgd);
        }
}

static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
{
        VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
        /*
         * Use boot_cpu_has() instead of this_cpu_has() as this function
         * might be called during early boot.  This should work even after
         * boot because all CPUs have the same capabilities:
         */
        VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
        return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
}
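
/*
 * Layout sketch (a non-normative note): the value built above is the
 * physical address of the PGD (with the SME mask applied by __sme_pa())
 * plus the kPCID in the low PCID bits; CR3_NOFLUSH sets the top bit of
 * CR3, which asks the CPU to keep TLB entries tagged with that PCID
 * rather than flushing them on the write.
 */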

/*
 * We get here when we do something requiring a TLB invalidation
 * but could not go invalidate all of the contexts.  We do the
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 */
static void clear_asid_other(void)
{
        u16 asid;

        /*
         * This is only expected to be set if we have disabled
         * kernel _PAGE_GLOBAL pages.
         */
        if (!static_cpu_has(X86_FEATURE_PTI)) {
                WARN_ON_ONCE(1);
                return;
        }

        for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
                /* Do not need to flush the current asid */
                if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
                        continue;
                /*
                 * Make sure the next time we go to switch to
                 * this asid, we do a flush:
                 */
                this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
        }
        this_cpu_write(cpu_tlbstate.invalidate_other, false);
}

atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);

static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
                            u16 *new_asid, bool *need_flush)
{
        u16 asid;

        if (!static_cpu_has(X86_FEATURE_PCID)) {
                *new_asid = 0;
                *need_flush = true;
                return;
        }

        if (this_cpu_read(cpu_tlbstate.invalidate_other))
                clear_asid_other();

        for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
                if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
                    next->context.ctx_id)
                        continue;

                *new_asid = asid;
                *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
                               next_tlb_gen);
                return;
        }

        /*
         * We don't currently own an ASID slot on this CPU.
         * Allocate a slot.
         */
        *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
        if (*new_asid >= TLB_NR_DYN_ASIDS) {
                *new_asid = 0;
                this_cpu_write(cpu_tlbstate.next_asid, 1);
        }
        *need_flush = true;
}
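
/*
 * Summary (illustrative): an mm still cached in one of the TLB_NR_DYN_ASIDS
 * per-CPU slots keeps its ASID and only needs a flush if its tlb_gen has
 * advanced; otherwise the next slot is taken round-robin, wrapping back to
 * ASID 0, and a full flush is requested for the reused slot.
 */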

/*
 * Given an ASID, flush the corresponding user ASID.  We can delay this
 * until the next time we switch to it.
 *
 * See SWITCH_TO_USER_CR3.
 */
static inline void invalidate_user_asid(u16 asid)
{
        /* There is no user ASID if address space separation is off */
        if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
                return;

        /*
         * We only have a single ASID if PCID is off and the CR3
         * write will have flushed it.
         */
        if (!cpu_feature_enabled(X86_FEATURE_PCID))
                return;

        if (!static_cpu_has(X86_FEATURE_PTI))
                return;

        __set_bit(kern_pcid(asid),
                  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}
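
/*
 * Deferral note (assumed behaviour of the entry code referenced above): the
 * bit set in user_pcid_flush_mask is expected to be consumed by
 * SWITCH_TO_USER_CR3, which clears it and performs a flushing CR3 write for
 * that user PCID instead of a CR3_NOFLUSH one.
 */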

static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
        unsigned long new_mm_cr3;

        if (need_flush) {
                invalidate_user_asid(new_asid);
                new_mm_cr3 = build_cr3(pgdir, new_asid);
        } else {
                new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
        }

        /*
         * Caution: many callers of this function expect
         * that load_cr3() is serializing and orders TLB
         * fills with respect to the mm_cpumask writes.
         */
        write_cr3(new_mm_cr3);
}

void leave_mm(int cpu)
{
        struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

        /*
         * It's plausible that we're in lazy TLB mode while our mm is init_mm.
         * If so, our callers still expect us to flush the TLB, but there
         * aren't any user TLB entries in init_mm to worry about.
         *
         * This needs to happen before any other sanity checks due to
         * intel_idle's shenanigans.
         */
        if (loaded_mm == &init_mm)
                return;

        /* Warn if we're not lazy. */
        WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));

        switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
               struct task_struct *tsk)
{
        unsigned long flags;

        local_irq_save(flags);
        switch_mm_irqs_off(prev, next, tsk);
        local_irq_restore(flags);
}

static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{
        unsigned long next_tif = task_thread_info(next)->flags;
        unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;

        return (unsigned long)next->mm | spec_bits;
}
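
/*
 * Example (illustrative): if the incoming task has TIF_SPEC_IB set, the
 * returned value is its mm pointer with bit 0 (LAST_USER_MM_IBPB) ORed in;
 * mm_struct pointers are sufficiently aligned that the low bit is otherwise
 * always zero and can carry this tag.
 */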

static void cond_mitigation(struct task_struct *next)
{
        unsigned long prev_mm, next_mm;

        if (!next || !next->mm)
                return;

        next_mm = mm_mangle_tif_spec_bits(next);
        prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);

        /*
         * Avoid user/user BTB poisoning by flushing the branch predictor
         * when switching between processes. This stops one process from
         * doing Spectre-v2 attacks on another.
         *
         * Both the conditional and the always-on IBPB mode use the mm
         * pointer to avoid the IBPB when switching between tasks of the
         * same process. Using the mm pointer instead of mm->context.ctx_id
         * opens a hypothetical hole vs. mm_struct reuse, which is more or
         * less impossible to control by an attacker. Aside from that it
         * would only affect the first schedule so the theoretically
         * exposed data is not really interesting.
         */
        if (static_branch_likely(&switch_mm_cond_ibpb)) {
                /*
                 * This is a bit more complex than the always mode because
                 * it has to handle two cases:
                 *
                 * 1) Switch from a user space task (potential attacker)
                 *    which has TIF_SPEC_IB set to a user space task
                 *    (potential victim) which has TIF_SPEC_IB not set.
                 *
                 * 2) Switch from a user space task (potential attacker)
                 *    which has TIF_SPEC_IB not set to a user space task
                 *    (potential victim) which has TIF_SPEC_IB set.
                 *
                 * This could be done by unconditionally issuing IBPB when
                 * a task which has TIF_SPEC_IB set is either scheduled in
                 * or out. Though that results in two flushes when:
                 *
                 * - the same user space task is scheduled out and later
                 *   scheduled in again and only a kernel thread ran in
                 *   between.
                 *
                 * - a user space task belonging to the same process is
                 *   scheduled in after a kernel thread ran in between
                 *
                 * - a user space task belonging to the same process is
                 *   scheduled in immediately.
                 *
                 * Optimize this with reasonably small overhead for the
                 * above cases. Mangle the TIF_SPEC_IB bit into the mm
                 * pointer of the incoming task which is stored in
                 * cpu_tlbstate.last_user_mm_spec for comparison.
                 *
                 * Issue IBPB only if the mm's are different and one or
                 * both have the IBPB bit set.
                 */
                if (next_mm != prev_mm &&
                    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
                        indirect_branch_prediction_barrier();
        }

        if (static_branch_unlikely(&switch_mm_always_ibpb)) {
                /*
                 * Only flush when switching to a user space task with a
                 * different context than the user space task which ran
                 * last on this CPU.
                 */
                if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
                    (unsigned long)next->mm)
                        indirect_branch_prediction_barrier();
        }

        this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
}
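
/*
 * Summary (illustrative): in the conditional mode an IBPB is issued only
 * when the mangled mm values differ and at least one of the two tasks has
 * the IBPB bit set; in the always-on mode it is issued on every switch to
 * a different user mm, regardless of the TIF_SPEC_IB bits.
 */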

#ifdef CONFIG_PERF_EVENTS
static inline void cr4_update_pce_mm(struct mm_struct *mm)
{
        if (static_branch_unlikely(&rdpmc_always_available_key) ||
            (!static_branch_unlikely(&rdpmc_never_available_key) &&
             atomic_read(&mm->context.perf_rdpmc_allowed))) {
                /*
                 * Clear the existing dirty counters to
                 * prevent the leak for an RDPMC task.
                 */
                perf_clear_dirty_counters();
                cr4_set_bits_irqsoff(X86_CR4_PCE);
        } else
                cr4_clear_bits_irqsoff(X86_CR4_PCE);
}

void cr4_update_pce(void *ignored)
{
        cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
}

#else
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif
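
/*
 * Background note: CR4.PCE controls whether user space may execute RDPMC;
 * the helpers above toggle it per CPU based on the mm's perf_rdpmc_allowed
 * count and clear any dirty counters first, so one task's counter values
 * are not exposed to the next RDPMC user.
 */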

void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        struct task_struct *tsk)
{
        struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
        u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
        bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
        unsigned cpu = smp_processor_id();
        u64 next_tlb_gen;
        bool need_flush;
        u16 new_asid;

x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
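The changelog above reduces the lazy-TLB states to a single invariant: cpu_tlbstate.loaded_mm always names the mm whose page tables CR3 references, and leave_mm() simply switches the CPU onto init_mm. A minimal user-space sketch of that model follows; the struct and helper names are illustrative assumptions, not the kernel's definitions.
/* Illustrative model only -- names and layout are assumptions. */
struct model_mm { void *pgd; };

struct model_tlb_state {
	struct model_mm *loaded_mm;	/* mm whose page tables CR3 references */
	int is_lazy;			/* lazy mode: flushes may be deferred */
};

static struct model_mm model_init_mm;

/* leave_mm() in this model just moves the CPU onto init_mm. */
static void model_leave_mm(struct model_tlb_state *ts)
{
	ts->loaded_mm = &model_init_mm;	/* CR3 would now point at init_mm's pgd */
	ts->is_lazy = 0;
}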
|
|
|
/*
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, by itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it an array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The speedup on Vitaly's test appears to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
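The changelog above introduces per-CPU (ctx_id, tlb_gen) bookkeeping, deliberately shaped as an array of length 1 so the later PCID patches can grow it. A hedged sketch of that structure, with assumed field names, is:
/* Sketch of the bookkeeping described above; names are assumptions. */
struct model_tlb_context {
	unsigned long long ctx_id;	/* which mm this slot currently describes */
	unsigned long long tlb_gen;	/* last flush generation seen for that mm */
};

struct model_cpu_tlbstate {
	struct model_tlb_context ctxs[1];	/* grows past 1 once PCID lands */
};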
|
|
|
* NB: The scheduler will call us with prev == next when switching
|
|
|
|
* from lazy TLB mode to normal mode if active_mm isn't changing.
|
|
|
|
* When this happens, we don't assume that CR3 (and hence
|
|
|
|
* cpu_tlbstate.loaded_mm) matches next.
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relevant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
*
|
|
|
|
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
|
|
|
|
*/
|
2016-08-11 09:35:23 +00:00
|
|
|
|
2021-02-20 23:17:05 +00:00
|
|
|
/* We don't want flush_tlb_func() to run concurrently with us. */
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, by itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it an array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The speedup on Vitaly's test appears to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
|
|
|
|
WARN_ON_ONCE(!irqs_disabled());
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify that CR3 is what we think it is. This will catch
|
|
|
|
* hypothetical buggy code that directly switches to swapper_pg_dir
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
* without going through leave_mm() / switch_mm_irqs_off() or that
|
|
|
|
* does something like write_cr3(read_cr3_pa()).
|
2017-09-08 05:06:57 +00:00
|
|
|
*
|
|
|
|
* Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
|
|
|
|
* isn't free.
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, by itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it an array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The speedup on Vitaly's test appears to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
*/
|
2017-09-08 05:06:57 +00:00
|
|
|
#ifdef CONFIG_DEBUG_VM
|
2017-12-04 14:07:54 +00:00
|
|
|
if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
|
2017-09-08 05:06:57 +00:00
|
|
|
/*
|
|
|
|
* If we were to BUG here, we'd be very likely to kill
|
|
|
|
* the system so hard that we don't see the call trace.
|
|
|
|
* Try to recover instead by ignoring the error and doing
|
|
|
|
* a global flush to minimize the chance of corruption.
|
|
|
|
*
|
|
|
|
* (This is far from being a fully correct recovery.
|
|
|
|
* Architecturally, the CPU could prefetch something
|
|
|
|
* back into an incorrect ASID slot and leave it there
|
|
|
|
* to cause trouble down the road. It's better than
|
|
|
|
* nothing, though.)
|
|
|
|
*/
|
|
|
|
__flush_tlb_all();
|
|
|
|
}
|
|
|
|
#endif
|
2021-02-20 23:17:09 +00:00
|
|
|
if (was_lazy)
|
|
|
|
this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
|
2016-08-11 09:35:23 +00:00
|
|
|
|
2018-01-29 20:20:12 +00:00
|
|
|
/*
|
membarrier/x86: Provide core serializing command
There are two places where core serialization is needed by membarrier:
1) When returning from the membarrier IPI,
2) After scheduler updates curr to a thread with a different mm, before
going back to user-space, since the curr->mm is used by membarrier to
check whether it needs to send an IPI to that CPU.
x86-32 uses IRET as return from interrupt, and both IRET and SYSEXIT to go
back to user-space. The IRET instruction is core serializing, but not
SYSEXIT.
x86-64 uses IRET as return from interrupt, which takes care of the IPI.
However, it can return to user-space through either SYSRETL (compat
code), SYSRETQ, or IRET. Given that SYSRET{L,Q} is not core serializing,
we rely instead on write_cr3() performed by switch_mm() to provide core
serialization after changing the current mm, and deal with the special
case of kthread -> uthread (temporarily keeping current mm into
active_mm) by adding a sync_core() in that specific case.
Use the new sync_core_before_usermode() to guarantee this.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Andrew Hunter <ahh@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Avi Kivity <avi@scylladb.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Dave Watson <davejwatson@fb.com>
Cc: David Sehr <sehr@google.com>
Cc: Greg Hackmann <ghackmann@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maged Michael <maged.michael@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Link: http://lkml.kernel.org/r/20180129202020.8515-10-mathieu.desnoyers@efficios.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-01-29 20:20:18 +00:00
|
|
|
* The membarrier system call requires a full memory barrier and
|
|
|
|
* core serialization before returning to user-space, after
|
2020-12-04 05:07:03 +00:00
|
|
|
* storing to rq->curr, when changing mm. This is because
|
|
|
|
* membarrier() sends IPIs to all CPUs that are in the target mm
|
|
|
|
* to make them issue memory barriers. However, if another CPU
|
|
|
|
* switches to/from the target mm concurrently with
|
|
|
|
* membarrier(), it can cause that CPU not to receive an IPI
|
|
|
|
* when it really should issue a memory barrier. Writing to CR3
|
|
|
|
* provides that full memory barrier and core serializing
|
|
|
|
* instruction.
|
2018-01-29 20:20:12 +00:00
|
|
|
*/
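As the membarrier changelog above explains, the CR3 write performed on an mm change doubles as the required core-serializing instruction, while SYSRET/SYSEXIT paths that skip it need an explicit sync_core(). A hypothetical model of that rule (not the kernel's code):
/*
 * Hypothetical model: a CR3 write and IRET are core serializing,
 * SYSRET/SYSEXIT are not, so a return path that uses neither must
 * issue an explicit core-serializing instruction.
 */
static inline int model_need_explicit_core_sync(int wrote_cr3, int returns_via_iret)
{
	return !wrote_cr3 && !returns_via_iret;
}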
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relevant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
if (real_prev == next) {
|
2017-10-14 16:59:49 +00:00
|
|
|
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
|
|
|
|
next->context.ctx_id);
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, by itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it an array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The speedup on Vitaly's test appears to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
|
2016-04-26 16:39:08 +00:00
|
|
|
/*
|
2018-09-26 03:58:44 +00:00
|
|
|
* Even in lazy TLB mode, the CPU should stay set in the
|
|
|
|
* mm_cpumask. The TLB shootdown code can figure out from
|
2021-02-20 23:17:08 +00:00
|
|
|
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
|
2016-04-26 16:39:08 +00:00
|
|
|
*/
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
if (WARN_ON_ONCE(real_prev != &init_mm &&
|
|
|
|
!cpumask_test_cpu(cpu, mm_cpumask(next))))
|
|
|
|
cpumask_set_cpu(cpu, mm_cpumask(next));
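Per the comment above, a lazy CPU stays set in mm_cpumask and the shootdown path consults cpu_tlbstate_shared.is_lazy instead. A modeled sender-side check with assumed names (the real decision lives in the flush IPI code, not in this function):
/*
 * Modeled sender-side decision (assumed names): lazy CPUs are skipped;
 * they will notice the bumped tlb_gen when they leave lazy mode.
 */
static int model_cpu_needs_flush_ipi(int in_mm_cpumask, int is_lazy)
{
	return in_mm_cpumask && !is_lazy;
}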
|
|
|
|
|
2018-09-26 03:58:44 +00:00
|
|
|
/*
|
|
|
|
* If the CPU is not in lazy TLB mode, we are just switching
|
|
|
|
* from one thread in a process to another thread in the same
|
|
|
|
* process. No TLB flush required.
|
|
|
|
*/
|
|
|
|
if (!was_lazy)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read the tlb_gen to check whether a flush is needed.
|
|
|
|
* If the TLB is up to date, just use it.
|
|
|
|
* The barrier synchronizes with the tlb_gen increment in
|
|
|
|
* the TLB shootdown code.
|
|
|
|
*/
|
|
|
|
smp_mb();
|
|
|
|
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
|
|
|
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
|
|
|
|
next_tlb_gen)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TLB contents went out of date while we were in lazy
|
|
|
|
* mode. Fall through to the TLB switching code below.
|
|
|
|
*/
|
|
|
|
new_asid = prev_asid;
|
|
|
|
need_flush = true;
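The check above is a plain generation comparison ordered by smp_mb(). A stand-alone user-space model of the same idea, using C11 atomics and assumed names, looks like this:
#include <stdatomic.h>
#include <stdbool.h>

/* Each mm carries a flush generation; each CPU records what it has seen. */
struct model_mm_ctx  { _Atomic unsigned long long tlb_gen; };
struct model_cpu_ctx { unsigned long long seen_tlb_gen; };

static bool model_needs_flush(struct model_cpu_ctx *cpu, struct model_mm_ctx *mm)
{
	/* Pairs with the generation increment on the shootdown side. */
	atomic_thread_fence(memory_order_seq_cst);
	return cpu->seen_tlb_gen != atomic_load(&mm->tlb_gen);
}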
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, by itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it an array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The speedup on Vitaly's test appears to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
} else {
|
2018-01-29 22:04:47 +00:00
|
|
|
/*
|
2021-01-08 12:10:53 +00:00
|
|
|
* Apply process to process speculation vulnerability
|
|
|
|
* mitigations if applicable.
|
2018-01-29 22:04:47 +00:00
|
|
|
*/
|
2021-01-08 12:10:53 +00:00
|
|
|
cond_mitigation(tsk);
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, by itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it an array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The speedup on Vitaly's test appears to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
|
2018-07-16 19:03:37 +00:00
|
|
|
/*
|
|
|
|
* Stop remote flushes for the previous mm.
|
|
|
|
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
|
|
|
|
* but the bitmap manipulation can cause cache line contention.
|
|
|
|
*/
|
|
|
|
if (real_prev != &init_mm) {
|
|
|
|
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
|
|
|
|
mm_cpumask(real_prev)));
|
|
|
|
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
|
|
|
|
}
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relevant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, by itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it an array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The speedup on Vitaly's test appears to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
/*
|
|
|
|
* Start remote flushes and then read tlb_gen.
|
|
|
|
*/
|
2018-07-16 19:03:37 +00:00
|
|
|
if (next != &init_mm)
|
|
|
|
cpumask_set_cpu(cpu, mm_cpumask(next));
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, by itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it an array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The speedup on Vitaly's test appears to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relevant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
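The PCID changelog above treats an ASID as a per-CPU cache slot for a recently used mm. A simplified sketch of the kind of lookup choose_new_asid() performs -- an assumed shape for illustration, not the kernel's exact implementation:
/* Simplified per-CPU ASID choice; slot count and names are assumptions. */
#define MODEL_NR_ASIDS 6

struct model_asid_slot { unsigned long long ctx_id, tlb_gen; };

static void model_choose_asid(struct model_asid_slot *slots, int *next_slot,
			      unsigned long long ctx_id, unsigned long long want_gen,
			      int *asid, int *need_flush)
{
	int i;

	/* Reuse a slot that already holds this mm; flush only if it is stale. */
	for (i = 0; i < MODEL_NR_ASIDS; i++) {
		if (slots[i].ctx_id != ctx_id)
			continue;
		*asid = i;
		*need_flush = slots[i].tlb_gen < want_gen;
		return;
	}

	/* No match: evict round-robin and start the new ASID with a flush. */
	*asid = *next_slot;
	*next_slot = (*next_slot + 1) % MODEL_NR_ASIDS;
	*need_flush = 1;
}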
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relevant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
|
2018-08-29 15:47:18 +00:00
|
|
|
/* Let nmi_uaccess_okay() know that we're changing CR3. */
|
|
|
|
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
|
|
|
|
barrier();
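The sentinel stored above is consumed by nmi_uaccess_okay(); a hedged sketch of the kind of check such a consumer can make while CR3 is in flux (modeled, assumed names):
/*
 * Modeled consumer of the sentinel (assumed names): NMI code must not
 * touch user memory while a CR3 switch is in progress, or when the
 * loaded mm does not match the current task's mm.
 */
static int model_nmi_uaccess_okay(const void *loaded_mm, const void *switching_sentinel,
				  const void *current_mm)
{
	if (loaded_mm == switching_sentinel)
		return 0;
	return loaded_mm == current_mm;
}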
|
2018-09-26 03:58:39 +00:00
|
|
|
}
|
2018-08-29 15:47:18 +00:00
|
|
|
|
2018-09-26 03:58:39 +00:00
|
|
|
if (need_flush) {
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
|
|
|
|
load_new_mm_cr3(next->pgd, new_asid, true);
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
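To make the per-CPU PCID recycling described above concrete, here is a hedged sketch of a dynamic ASID picker along those lines. It is illustrative only (the helper name and exact control flow are assumptions, not necessarily the kernel's code): a slot whose recorded ctx_id already matches the incoming mm is reused, a flush is forced only when that slot's tlb_gen is stale, and otherwise the TLB_NR_DYN_ASIDS slots are handed out round-robin.

static void pick_asid_sketch(struct mm_struct *next, u64 next_tlb_gen,
			     u16 *new_asid, bool *need_flush)
{
	u16 asid;

	/* Without PCID there is only ASID 0 and every switch flushes. */
	if (!boot_cpu_has(X86_FEATURE_PCID)) {
		*new_asid = 0;
		*need_flush = true;
		return;
	}

	/* Reuse a slot that already holds this mm; flush only if stale. */
	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
		    next->context.ctx_id)
			continue;

		*new_asid = asid;
		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
			       next_tlb_gen);
		return;
	}

	/* Otherwise hand out the next slot round-robin and flush it. */
	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
	if (*new_asid >= TLB_NR_DYN_ASIDS) {
		*new_asid = 0;
		this_cpu_write(cpu_tlbstate.next_asid, 1);
	}
	*need_flush = true;
}

The chosen new_asid and need_flush then feed into load_new_mm_cr3(), as in the switch path shown above.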
|
|
|
|
2020-08-12 10:22:17 +00:00
|
|
|
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
2018-09-26 03:58:39 +00:00
|
|
|
} else {
|
|
|
|
/* The new ASID is already up to date. */
|
|
|
|
load_new_mm_cr3(next->pgd, new_asid, false);
|
2018-08-29 15:47:18 +00:00
|
|
|
|
2020-08-12 10:22:17 +00:00
|
|
|
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, by itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it an array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The speedup on Vitaly's test appears to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
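A minimal sketch of the tlb_gen bookkeeping this commit describes (the helper below is hypothetical, written only to illustrate the comparison): the mm's context.tlb_gen counts requested flushes, the per-CPU ctxs[] slot records how far this CPU has caught up, and a CPU coming out of lazy mode can tell whether it slept through a flush by comparing the two.

static bool missed_a_flush(struct mm_struct *mm, u16 asid)
{
	/* Generation the mm has reached (bumped by every flush request). */
	u64 mm_gen = atomic64_read(&mm->context.tlb_gen);
	/* Generation this CPU/ASID slot has actually flushed up to. */
	u64 local_gen = this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen);

	return local_gen < mm_gen;
}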
|
|
|
}
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relevant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
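Under the simplified model described above, leaving an mm is nothing more than an ordinary switch to init_mm. A hedged sketch follows (hypothetical helper name; the real leave_mm() may differ in detail):

static void leave_mm_sketch(void)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/* Already on the kernel page tables: nothing to do. */
	if (loaded_mm == &init_mm)
		return;

	/* No special lazy bookkeeping: switch like any other mm. */
	switch_mm_irqs_off(NULL, &init_mm, NULL);
}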
|
|
|
|
2018-09-26 03:58:39 +00:00
|
|
|
/* Make sure we write CR3 before loaded_mm. */
|
|
|
|
barrier();
|
|
|
|
|
|
|
|
this_cpu_write(cpu_tlbstate.loaded_mm, next);
|
|
|
|
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
|
|
|
|
|
2018-09-26 03:58:44 +00:00
|
|
|
if (next != real_prev) {
|
2020-04-21 09:20:30 +00:00
|
|
|
cr4_update_pce_mm(next);
|
2018-09-26 03:58:44 +00:00
|
|
|
switch_ldt(real_prev, next);
|
|
|
|
}
|
2016-04-26 16:39:08 +00:00
|
|
|
}
|
|
|
|
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
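The two repair strategies described above can be summarized in a short, hedged sketch. The helper name is illustrative, the symbols mirror ones used elsewhere in this file, and the kernel's actual decision logic has since been reworked (the enter_lazy_tlb() below now sets the lazy flag unconditionally):

static void enter_lazy_sketch(void)
{
	if (boot_cpu_has(X86_FEATURE_PCID)) {
		/* With PCID, dropping to init_mm right away is cheap:
		 * the old mm keeps its ASID and its TLB entries. */
		switch_mm_irqs_off(NULL, &init_mm, NULL);
	} else {
		/* Without PCID, just remember that we are lazy; the
		 * flush path switches to init_mm the first time a
		 * flush would otherwise be required. */
		this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
	}
}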
|
|
|
/*
|
2017-10-14 16:59:50 +00:00
|
|
|
* Please ignore the name of this function. It should be called
|
|
|
|
* switch_to_kernel_thread().
|
|
|
|
*
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
|
|
|
|
* kernel thread or other context without an mm. Acceptable implementations
|
|
|
|
* include doing nothing whatsoever, switching to init_mm, or various clever
|
|
|
|
* lazy tricks to try to minimize TLB flushes.
|
|
|
|
*
|
|
|
|
* The scheduler reserves the right to call enter_lazy_tlb() several times
|
|
|
|
* in a row. It will notify us that we're going back to a real mm by
|
|
|
|
* calling switch_mm_irqs_off().
|
|
|
|
*/
|
|
|
|
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
|
|
|
|
return;
|
|
|
|
|
2021-02-20 23:17:08 +00:00
|
|
|
this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
}
|
|
|
|
|
2017-09-07 02:54:53 +00:00
|
|
|
/*
|
|
|
|
* Call this when reinitializing a CPU. It fixes the following potential
|
|
|
|
* problems:
|
|
|
|
*
|
|
|
|
* - The ASID changed from what cpu_tlbstate thinks it is (most likely
|
|
|
|
* because the CPU was taken down and came back up with CR3's PCID
|
|
|
|
* bits clear). CPU hotplug can do this.
|
|
|
|
*
|
|
|
|
* - The TLB contains junk in slots corresponding to inactive ASIDs.
|
|
|
|
*
|
|
|
|
* - The CPU went so far out to lunch that it may have missed a TLB
|
|
|
|
* flush.
|
|
|
|
*/
|
|
|
|
void initialize_tlbstate_and_flush(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
|
|
|
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
|
|
|
|
unsigned long cr3 = __read_cr3();
|
|
|
|
|
|
|
|
/* Assert that CR3 already references the right mm. */
|
|
|
|
WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
|
|
|
|
* doesn't work like other CR4 bits because it can only be set from
|
|
|
|
* long mode.)
|
|
|
|
*/
|
2017-09-10 15:52:58 +00:00
|
|
|
WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
|
2017-09-07 02:54:53 +00:00
|
|
|
!(cr4_read_shadow() & X86_CR4_PCIDE));
|
|
|
|
|
|
|
|
/* Force ASID 0 and force a TLB flush. */
|
2017-12-04 14:07:54 +00:00
|
|
|
write_cr3(build_cr3(mm->pgd, 0));
|
2017-09-07 02:54:53 +00:00
|
|
|
|
|
|
|
/* Reinitialize tlbstate. */
|
2021-01-08 12:10:53 +00:00
|
|
|
this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
|
2017-09-07 02:54:53 +00:00
|
|
|
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
|
|
|
|
this_cpu_write(cpu_tlbstate.next_asid, 1);
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
|
|
|
|
|
|
|
|
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
|
|
|
|
}
|
|
|
|
|
2017-06-29 15:53:16 +00:00
|
|
|
/*
|
2021-02-20 23:17:05 +00:00
|
|
|
* flush_tlb_func()'s memory ordering requirement is that any
|
2017-06-29 15:53:16 +00:00
|
|
|
* TLB fills that happen after we flush the TLB are ordered after we
|
|
|
|
* read active_mm's tlb_gen. We don't need any explicit barriers
|
|
|
|
* because all x86 flush operations are serializing and the
|
|
|
|
* atomic64_read operation won't be reordered by the compiler.
|
|
|
|
*/
|
2021-02-20 23:17:05 +00:00
|
|
|
static void flush_tlb_func(void *info)
|
2008-03-03 17:12:54 +00:00
|
|
|
{
|
2017-06-29 15:53:16 +00:00
|
|
|
/*
|
|
|
|
* We have three different tlb_gen values in here. They are:
|
|
|
|
*
|
|
|
|
* - mm_tlb_gen: the latest generation.
|
|
|
|
* - local_tlb_gen: the generation that this CPU has already caught
|
|
|
|
* up to.
|
|
|
|
* - f->new_tlb_gen: the generation that the requester of the flush
|
|
|
|
* wants us to catch up to.
|
|
|
|
*/
|
2021-02-20 23:17:05 +00:00
|
|
|
const struct flush_tlb_info *f = info;
|
2017-06-29 15:53:16 +00:00
|
|
|
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
2017-06-29 15:53:16 +00:00
|
|
|
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
|
2021-02-20 23:17:05 +00:00
|
|
|
bool local = smp_processor_id() == f->initiating_cpu;
|
|
|
|
unsigned long nr_invalidate = 0;
|
2017-06-29 15:53:16 +00:00
|
|
|
|
2017-06-29 15:53:13 +00:00
|
|
|
/* This code cannot presently handle being reentered. */
|
|
|
|
VM_WARN_ON(!irqs_disabled());
|
|
|
|
|
2021-02-20 23:17:05 +00:00
|
|
|
if (!local) {
|
|
|
|
inc_irq_stat(irq_tlb_count);
|
|
|
|
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
|
|
|
|
|
|
|
|
/* Can only happen on remote CPUs */
|
|
|
|
if (f->mm && f->mm != loaded_mm)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
if (unlikely(loaded_mm == &init_mm))
|
|
|
|
return;
|
|
|
|
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
|
2017-06-29 15:53:16 +00:00
|
|
|
loaded_mm->context.ctx_id);
|
|
|
|
|
2021-02-20 23:17:08 +00:00
|
|
|
if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) {
|
2017-06-29 15:53:16 +00:00
|
|
|
/*
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
* We're in lazy mode. We need to at least flush our
|
|
|
|
* paging-structure cache to avoid speculatively reading
|
|
|
|
* garbage into our TLB. Since switching to init_mm is barely
|
|
|
|
* slower than a minimal flush, just switch to init_mm.
|
2018-09-26 03:58:44 +00:00
|
|
|
*
|
2021-02-20 23:17:07 +00:00
|
|
|
* This should be rare, with native_flush_tlb_multi() skipping
|
2018-09-26 03:58:44 +00:00
|
|
|
* IPIs to lazy TLB mode CPUs.
|
2017-06-29 15:53:16 +00:00
|
|
|
*/
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
switch_mm_irqs_off(NULL, &init_mm, NULL);
|
2017-05-22 22:30:02 +00:00
|
|
|
return;
|
|
|
|
}
|
2008-03-03 17:12:54 +00:00
|
|
|
|
2017-06-29 15:53:16 +00:00
|
|
|
if (unlikely(local_tlb_gen == mm_tlb_gen)) {
|
|
|
|
/*
|
|
|
|
* There's nothing to do: we're already up to date. This can
|
|
|
|
* happen if two concurrent flushes happen -- the first flush to
|
|
|
|
* be handled can catch us all the way up, leaving no work for
|
|
|
|
* the second flush.
|
|
|
|
*/
|
2021-02-20 23:17:05 +00:00
|
|
|
goto done;
|
2017-06-29 15:53:16 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
|
|
|
|
WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we get to this point, we know that our TLB is out of date.
|
|
|
|
* This does not strictly imply that we need to flush (it's
|
|
|
|
* possible that f->new_tlb_gen <= local_tlb_gen), but we're
|
|
|
|
* going to need to flush in the very near future, so we might
|
|
|
|
* as well get it over with.
|
|
|
|
*
|
|
|
|
* The only question is whether to do a full or partial flush.
|
|
|
|
*
|
|
|
|
* We do a partial flush if requested and two extra conditions
|
|
|
|
* are met:
|
|
|
|
*
|
|
|
|
* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
|
|
|
|
* we've always done all needed flushes to catch up to
|
|
|
|
* local_tlb_gen. If, for example, local_tlb_gen == 2 and
|
|
|
|
* f->new_tlb_gen == 3, then we know that the flush needed to bring
|
|
|
|
* us up to date for tlb_gen 3 is the partial flush we're
|
|
|
|
* processing.
|
|
|
|
*
|
|
|
|
* As an example of why this check is needed, suppose that there
|
|
|
|
* are two concurrent flushes. The first is a full flush that
|
|
|
|
* changes context.tlb_gen from 1 to 2. The second is a partial
|
|
|
|
* flush that changes context.tlb_gen from 2 to 3. If they get
|
|
|
|
* processed on this CPU in reverse order, we'll see
|
|
|
|
* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
|
2018-01-31 16:03:10 +00:00
|
|
|
* If we were to use __flush_tlb_one_user() and set local_tlb_gen to
|
2017-06-29 15:53:16 +00:00
|
|
|
* 3, we'd break the invariant: we'd update local_tlb_gen above
|
|
|
|
* 1 without the full flush that's needed for tlb_gen 2.
|
|
|
|
*
|
2021-03-18 14:28:01 +00:00
|
|
|
* 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
|
2017-06-29 15:53:16 +00:00
|
|
|
* Partial TLB flushes are not all that much cheaper than full TLB
|
|
|
|
* flushes, so it seems unlikely that it would be a performance win
|
|
|
|
* to do a partial flush if that won't bring our TLB fully up to
|
|
|
|
* date. By doing a full flush instead, we can increase
|
|
|
|
* local_tlb_gen all the way to mm_tlb_gen and we can probably
|
|
|
|
* avoid another flush in the very near future.
|
|
|
|
*/
|
|
|
|
if (f->end != TLB_FLUSH_ALL &&
|
|
|
|
f->new_tlb_gen == local_tlb_gen + 1 &&
|
|
|
|
f->new_tlb_gen == mm_tlb_gen) {
|
|
|
|
/* Partial flush */
|
2018-08-26 10:56:48 +00:00
|
|
|
unsigned long addr = f->start;
|
2017-06-29 15:53:16 +00:00
|
|
|
|
2021-02-20 23:17:05 +00:00
|
|
|
nr_invalidate = (f->end - f->start) >> f->stride_shift;
|
|
|
|
|
2017-05-28 17:00:10 +00:00
|
|
|
while (addr < f->end) {
|
2020-04-21 09:20:34 +00:00
|
|
|
flush_tlb_one_user(addr);
|
2018-08-26 10:56:48 +00:00
|
|
|
addr += 1UL << f->stride_shift;
|
2017-05-22 22:30:02 +00:00
|
|
|
}
|
2017-05-28 17:00:12 +00:00
|
|
|
if (local)
|
2018-08-26 10:56:48 +00:00
|
|
|
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
|
2017-06-29 15:53:16 +00:00
|
|
|
} else {
|
|
|
|
/* Full flush. */
|
2021-02-20 23:17:05 +00:00
|
|
|
nr_invalidate = TLB_FLUSH_ALL;
|
|
|
|
|
2020-04-21 09:20:32 +00:00
|
|
|
flush_tlb_local();
|
2017-06-29 15:53:16 +00:00
|
|
|
if (local)
|
|
|
|
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
|
2017-05-22 22:30:02 +00:00
|
|
|
}
|
2017-06-29 15:53:16 +00:00
|
|
|
|
|
|
|
/* Both paths above update our state to mm_tlb_gen. */
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
|
2017-05-28 17:00:12 +00:00
|
|
|
|
2021-02-20 23:17:05 +00:00
|
|
|
/* Tracing is done in a unified manner to reduce the code size */
|
|
|
|
done:
|
|
|
|
trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN :
|
|
|
|
(f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN :
|
|
|
|
TLB_LOCAL_MM_SHOOTDOWN,
|
|
|
|
nr_invalidate);
|
2017-05-28 17:00:12 +00:00
|
|
|
}
|
|
|
|
|
2021-02-20 23:17:06 +00:00
|
|
|
static bool tlb_is_not_lazy(int cpu)
|
2017-05-28 17:00:12 +00:00
|
|
|
{
|
2021-02-20 23:17:08 +00:00
|
|
|
return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
|
2017-05-28 17:00:12 +00:00
|
|
|
}
|
|
|
|
|
2021-02-20 23:17:06 +00:00
|
|
|
static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
|
|
|
|
|
2021-02-20 23:17:08 +00:00
|
|
|
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
|
|
|
|
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);
|
2018-09-26 03:58:44 +00:00
|
|
|
|
2021-02-20 23:17:07 +00:00
|
|
|
STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
|
2020-04-21 09:20:36 +00:00
|
|
|
const struct flush_tlb_info *info)
|
2009-01-11 05:58:09 +00:00
|
|
|
{
|
2021-02-20 23:17:07 +00:00
|
|
|
/*
|
|
|
|
* Do accounting and tracing. Note that there are (and have always been)
|
|
|
|
* cases in which a remote TLB flush will be traced, but ultimately
|
|
|
|
* not performed.
|
|
|
|
*/
|
2014-01-21 22:33:16 +00:00
|
|
|
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
|
2017-05-28 17:00:10 +00:00
|
|
|
if (info->end == TLB_FLUSH_ALL)
|
2016-04-01 21:31:23 +00:00
|
|
|
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
|
|
|
|
else
|
|
|
|
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
|
2017-05-28 17:00:10 +00:00
|
|
|
(info->end - info->start) >> PAGE_SHIFT);
|
2016-04-01 21:31:23 +00:00
|
|
|
|
2018-09-26 03:58:44 +00:00
|
|
|
/*
|
|
|
|
* If no page tables were freed, we can skip sending IPIs to
|
|
|
|
* CPUs in lazy TLB mode. They will flush the CPU themselves
|
|
|
|
* at the next context switch.
|
|
|
|
*
|
|
|
|
* However, if page tables are getting freed, we need to send the
|
|
|
|
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
|
|
|
|
* up on the new contents of what used to be page tables, while
|
|
|
|
* doing a speculative memory access.
|
|
|
|
*/
|
2021-02-20 23:17:06 +00:00
|
|
|
if (info->freed_tables) {
|
2021-02-20 23:17:07 +00:00
|
|
|
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
|
2021-02-20 23:17:06 +00:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Although we could have used on_each_cpu_cond_mask(),
|
|
|
|
* open-coding it has performance advantages, as it eliminates
|
|
|
|
* the need for indirect calls or retpolines. In addition, it
|
|
|
|
* allows the use of a designated cpumask for evaluating the
|
|
|
|
* condition, instead of allocating one.
|
|
|
|
*
|
|
|
|
* This code works under the assumption that there are no nested
|
|
|
|
* TLB flushes, an assumption that is already made in
|
|
|
|
* flush_tlb_mm_range().
|
|
|
|
*
|
|
|
|
* cond_cpumask is logically a stack-local variable, but it is
|
|
|
|
* more efficient to have it off the stack and not to allocate
|
|
|
|
* it on demand. Preemption is disabled and this code is
|
|
|
|
* non-reentrant.
|
|
|
|
*/
|
|
|
|
struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask);
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
cpumask_clear(cond_cpumask);
|
|
|
|
|
|
|
|
for_each_cpu(cpu, cpumask) {
|
|
|
|
if (tlb_is_not_lazy(cpu))
|
|
|
|
__cpumask_set_cpu(cpu, cond_cpumask);
|
|
|
|
}
|
2021-02-20 23:17:07 +00:00
|
|
|
on_each_cpu_mask(cond_cpumask, flush_tlb_func, (void *)info, true);
|
2021-02-20 23:17:06 +00:00
|
|
|
}
|
2008-03-03 17:12:54 +00:00
|
|
|
}
|
|
|
|
|
2021-02-20 23:17:07 +00:00
|
|
|
void flush_tlb_multi(const struct cpumask *cpumask,
|
2020-04-21 09:20:36 +00:00
|
|
|
const struct flush_tlb_info *info)
|
|
|
|
{
|
2021-02-20 23:17:07 +00:00
|
|
|
__flush_tlb_multi(cpumask, info);
|
2020-04-21 09:20:36 +00:00
|
|
|
}
|
|
|
|
|
x86/mm: Set TLB flush tunable to sane value (33)
This has been run through Intel's LKP tests across a wide range
of modern systems and workloads and it wasn't shown to make a
measurable performance difference positive or negative.
Now that we have some shiny new tracepoints, we can actually
figure out what the heck is going on.
During a kernel compile, 60% of the flush_tlb_mm_range() calls
are for a single page. It breaks down like this:
size percent percent<=
V V V
GLOBAL: 2.20% 2.20% avg cycles: 2283
1: 56.92% 59.12% avg cycles: 1276
2: 13.78% 72.90% avg cycles: 1505
3: 8.26% 81.16% avg cycles: 1880
4: 7.41% 88.58% avg cycles: 2447
5: 1.73% 90.31% avg cycles: 2358
6: 1.32% 91.63% avg cycles: 2563
7: 1.14% 92.77% avg cycles: 2862
8: 0.62% 93.39% avg cycles: 3542
9: 0.08% 93.47% avg cycles: 3289
10: 0.43% 93.90% avg cycles: 3570
11: 0.20% 94.10% avg cycles: 3767
12: 0.08% 94.18% avg cycles: 3996
13: 0.03% 94.20% avg cycles: 4077
14: 0.02% 94.23% avg cycles: 4836
15: 0.04% 94.26% avg cycles: 5699
16: 0.06% 94.32% avg cycles: 5041
17: 0.57% 94.89% avg cycles: 5473
18: 0.02% 94.91% avg cycles: 5396
19: 0.03% 94.95% avg cycles: 5296
20: 0.02% 94.96% avg cycles: 6749
21: 0.18% 95.14% avg cycles: 6225
22: 0.01% 95.15% avg cycles: 6393
23: 0.01% 95.16% avg cycles: 6861
24: 0.12% 95.28% avg cycles: 6912
25: 0.05% 95.32% avg cycles: 7190
26: 0.01% 95.33% avg cycles: 7793
27: 0.01% 95.34% avg cycles: 7833
28: 0.01% 95.35% avg cycles: 8253
29: 0.08% 95.42% avg cycles: 8024
30: 0.03% 95.45% avg cycles: 9670
31: 0.01% 95.46% avg cycles: 8949
32: 0.01% 95.46% avg cycles: 9350
33: 3.11% 98.57% avg cycles: 8534
34: 0.02% 98.60% avg cycles: 10977
35: 0.02% 98.62% avg cycles: 11400
We get into diminishing returns pretty quickly. On pre-IvyBridge
CPUs, we used to set the limit at 8 pages, and it was set at 128
on IvyBridge. That 128 number looks pretty silly considering that
less than 0.5% of the flushes are that large.
The previous code tried to size this number based on the size of
the TLB. Good idea, but it's error-prone, needs maintenance
(which it didn't get up to now), and probably would not matter in
practice much.
Setting it to 33 means that we cover the mallopt
M_TRIM_THRESHOLD, which is the most universally common size to do
flushes.
That's the short version. Here's the long one for why I chose 33:
1. These numbers have a constant bias in the timestamps from the
tracing. Probably counts for a couple hundred cycles in each of
these tests, but it should be fairly _even_ across all of them.
The smallest delta between the tracepoints I have ever seen is
335 cycles. This is one reason the cycles/page cost goes down in
general as the flushes get larger. The true cost is nearer to
100 cycles.
2. A full flush is more expensive than a single invlpg, but not
by much (single percentages).
3. A dtlb miss is 17.1ns (~45 cycles) and an itlb miss is 13.0ns
(~34 cycles). At those rates, refilling the 512-entry dTLB takes
22,000 cycles.
4. 22,000 cycles is approximately the equivalent of doing 85
invlpg operations. But, the odds are that the TLB can
actually be filled up faster than that because TLB misses that
are close in time also tend to leverage the same caches.
6. ~98% of flushes are <=33 pages. There are a lot of flushes of
33 pages, probably because libc's M_TRIM_THRESHOLD is set to
128k (32 pages)
7. I've found no consistent data to support changing the IvyBridge
vs. SandyBridge tunable by a factor of 16
I used the performance counters on this hardware (IvyBridge i5-3320M)
to figure out the tlb miss costs:
ocperf.py stat -e dtlb_load_misses.walk_duration,dtlb_load_misses.walk_completed,dtlb_store_misses.walk_duration,dtlb_store_misses.walk_completed,itlb_misses.walk_duration,itlb_misses.walk_completed,itlb.itlb_flush
7,720,030,970 dtlb_load_misses_walk_duration [57.13%]
169,856,353 dtlb_load_misses_walk_completed [57.15%]
708,832,859 dtlb_store_misses_walk_duration [57.17%]
19,346,823 dtlb_store_misses_walk_completed [57.17%]
2,779,687,402 itlb_misses_walk_duration [57.15%]
82,241,148 itlb_misses_walk_completed [57.13%]
770,717 itlb_itlb_flush [57.11%]
These show that a dtlb miss is 17.1ns (~45 cycles) and an itlb miss is 13.0ns
(~34 cycles). At those rates, refilling the 512-entry dTLB takes
22,000 cycles. On a SandyBridge system with more cores and larger
caches, those are dtlb=13.4ns and itlb=9.5ns.
cat perf.stat.txt | perl -pe 's/,//g'
| awk '/itlb_misses_walk_duration/ { icyc+=$1 }
/itlb_misses_walk_completed/ { imiss+=$1 }
/dtlb_.*_walk_duration/ { dcyc+=$1 }
/dtlb_.*.*completed/ { dmiss+=$1 }
END {print "itlb cyc/miss: ", icyc/imiss, " dtlb cyc/miss: ", dcyc/dmiss, " ----- ", icyc,imiss, dcyc,dmiss }
On Westmere CPUs, the counters to use are: itlb_flush,itlb_misses.walk_cycles,itlb_misses.any,dtlb_misses.walk_cycles,dtlb_misses.any
The assumptions that this code went in under:
https://lkml.org/lkml/2012/6/12/119 say that a flush and a refill are
about 100ns. Being generous, that is over by a factor of 6 on the
refill side, although it is fairly close on the cost of an invlpg.
An increase of a single invlpg operation seems to lengthen the flush
range operation by about 200 cycles. Here is one example of the data
collected for flushing 10 and 11 pages (full data are below):
10: 0.43% 93.90% avg cycles: 3570 cycles/page: 357 samples: 4714
11: 0.20% 94.10% avg cycles: 3767 cycles/page: 342 samples: 2145
How to generate this table:
echo 10000 > /sys/kernel/debug/tracing/buffer_size_kb
echo x86-tsc > /sys/kernel/debug/tracing/trace_clock
echo 'reason != 0' > /sys/kernel/debug/tracing/events/tlb/tlb_flush/filter
echo 1 > /sys/kernel/debug/tracing/events/tlb/tlb_flush/enable
Pipe the trace output into this script:
http://sr71.net/~dave/intel/201402-tlb/trace-time-diff-process.pl.txt
Note that these data were gathered with the invlpg threshold set to
150 pages. Only data points with >=50 samples were printed:
Flush % of %<=
in flush this
pages es size
------------------------------------------------------------------------------
-1: 2.20% 2.20% avg cycles: 2283 cycles/page: xxxx samples: 23960
1: 56.92% 59.12% avg cycles: 1276 cycles/page: 1276 samples: 620895
2: 13.78% 72.90% avg cycles: 1505 cycles/page: 752 samples: 150335
3: 8.26% 81.16% avg cycles: 1880 cycles/page: 626 samples: 90131
4: 7.41% 88.58% avg cycles: 2447 cycles/page: 611 samples: 80877
5: 1.73% 90.31% avg cycles: 2358 cycles/page: 471 samples: 18885
6: 1.32% 91.63% avg cycles: 2563 cycles/page: 427 samples: 14397
7: 1.14% 92.77% avg cycles: 2862 cycles/page: 408 samples: 12441
8: 0.62% 93.39% avg cycles: 3542 cycles/page: 442 samples: 6721
9: 0.08% 93.47% avg cycles: 3289 cycles/page: 365 samples: 917
10: 0.43% 93.90% avg cycles: 3570 cycles/page: 357 samples: 4714
11: 0.20% 94.10% avg cycles: 3767 cycles/page: 342 samples: 2145
12: 0.08% 94.18% avg cycles: 3996 cycles/page: 333 samples: 864
13: 0.03% 94.20% avg cycles: 4077 cycles/page: 313 samples: 289
14: 0.02% 94.23% avg cycles: 4836 cycles/page: 345 samples: 236
15: 0.04% 94.26% avg cycles: 5699 cycles/page: 379 samples: 390
16: 0.06% 94.32% avg cycles: 5041 cycles/page: 315 samples: 643
17: 0.57% 94.89% avg cycles: 5473 cycles/page: 321 samples: 6229
18: 0.02% 94.91% avg cycles: 5396 cycles/page: 299 samples: 224
19: 0.03% 94.95% avg cycles: 5296 cycles/page: 278 samples: 367
20: 0.02% 94.96% avg cycles: 6749 cycles/page: 337 samples: 185
21: 0.18% 95.14% avg cycles: 6225 cycles/page: 296 samples: 1964
22: 0.01% 95.15% avg cycles: 6393 cycles/page: 290 samples: 83
23: 0.01% 95.16% avg cycles: 6861 cycles/page: 298 samples: 61
24: 0.12% 95.28% avg cycles: 6912 cycles/page: 288 samples: 1307
25: 0.05% 95.32% avg cycles: 7190 cycles/page: 287 samples: 533
26: 0.01% 95.33% avg cycles: 7793 cycles/page: 299 samples: 94
27: 0.01% 95.34% avg cycles: 7833 cycles/page: 290 samples: 66
28: 0.01% 95.35% avg cycles: 8253 cycles/page: 294 samples: 73
29: 0.08% 95.42% avg cycles: 8024 cycles/page: 276 samples: 846
30: 0.03% 95.45% avg cycles: 9670 cycles/page: 322 samples: 296
31: 0.01% 95.46% avg cycles: 8949 cycles/page: 288 samples: 79
32: 0.01% 95.46% avg cycles: 9350 cycles/page: 292 samples: 60
33: 3.11% 98.57% avg cycles: 8534 cycles/page: 258 samples: 33936
34: 0.02% 98.60% avg cycles: 10977 cycles/page: 322 samples: 268
35: 0.02% 98.62% avg cycles: 11400 cycles/page: 325 samples: 177
36: 0.01% 98.63% avg cycles: 11504 cycles/page: 319 samples: 161
37: 0.02% 98.65% avg cycles: 11596 cycles/page: 313 samples: 182
38: 0.02% 98.66% avg cycles: 11850 cycles/page: 311 samples: 195
39: 0.01% 98.68% avg cycles: 12158 cycles/page: 311 samples: 128
40: 0.01% 98.68% avg cycles: 11626 cycles/page: 290 samples: 78
41: 0.04% 98.73% avg cycles: 11435 cycles/page: 278 samples: 477
42: 0.01% 98.73% avg cycles: 12571 cycles/page: 299 samples: 74
43: 0.01% 98.74% avg cycles: 12562 cycles/page: 292 samples: 78
44: 0.01% 98.75% avg cycles: 12991 cycles/page: 295 samples: 108
45: 0.01% 98.76% avg cycles: 13169 cycles/page: 292 samples: 78
46: 0.02% 98.78% avg cycles: 12891 cycles/page: 280 samples: 261
47: 0.01% 98.79% avg cycles: 13099 cycles/page: 278 samples: 67
48: 0.01% 98.80% avg cycles: 13851 cycles/page: 288 samples: 77
49: 0.01% 98.80% avg cycles: 13749 cycles/page: 280 samples: 66
50: 0.01% 98.81% avg cycles: 13949 cycles/page: 278 samples: 73
52: 0.00% 98.82% avg cycles: 14243 cycles/page: 273 samples: 52
54: 0.01% 98.83% avg cycles: 15312 cycles/page: 283 samples: 87
55: 0.01% 98.84% avg cycles: 15197 cycles/page: 276 samples: 109
56: 0.02% 98.86% avg cycles: 15234 cycles/page: 272 samples: 208
57: 0.00% 98.86% avg cycles: 14888 cycles/page: 261 samples: 53
58: 0.01% 98.87% avg cycles: 15037 cycles/page: 259 samples: 59
59: 0.01% 98.87% avg cycles: 15752 cycles/page: 266 samples: 63
62: 0.00% 98.89% avg cycles: 16222 cycles/page: 261 samples: 54
64: 0.02% 98.91% avg cycles: 17179 cycles/page: 268 samples: 248
65: 0.12% 99.03% avg cycles: 18762 cycles/page: 288 samples: 1324
85: 0.00% 99.10% avg cycles: 21649 cycles/page: 254 samples: 50
127: 0.01% 99.18% avg cycles: 32397 cycles/page: 255 samples: 75
128: 0.13% 99.31% avg cycles: 31711 cycles/page: 247 samples: 1466
129: 0.18% 99.49% avg cycles: 33017 cycles/page: 255 samples: 1927
181: 0.33% 99.84% avg cycles: 2489 cycles/page: 13 samples: 3547
256: 0.05% 99.91% avg cycles: 2305 cycles/page: 9 samples: 550
512: 0.03% 99.95% avg cycles: 2133 cycles/page: 4 samples: 304
1512: 0.01% 99.99% avg cycles: 3038 cycles/page: 2 samples: 65
Here are the tlb counters during a 10-second slice of a kernel compile
for a SandyBridge system. It's better than IvyBridge, but probably
due to the larger caches since this was one of the 'X' extreme parts.
10,873,007,282 dtlb_load_misses_walk_duration
250,711,333 dtlb_load_misses_walk_completed
1,212,395,865 dtlb_store_misses_walk_duration
31,615,772 dtlb_store_misses_walk_completed
5,091,010,274 itlb_misses_walk_duration
163,193,511 itlb_misses_walk_completed
1,321,980 itlb_itlb_flush
10.008045158 seconds time elapsed
# cat perf.stat.1392743721.txt | perl -pe 's/,//g' | awk '/itlb_misses_walk_duration/ { icyc+=$1 } /itlb_misses_walk_completed/ { imiss+=$1 } /dtlb_.*_walk_duration/ { dcyc+=$1 } /dtlb_.*.*completed/ { dmiss+=$1 } END {print "itlb cyc/miss: ", icyc/imiss/3.3, " dtlb cyc/miss: ", dcyc/dmiss/3.3, " ----- ", icyc,imiss, dcyc,dmiss }'
itlb ns/miss: 9.45338 dtlb ns/miss: 12.9716
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154103.10C1115E@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2014-07-31 15:41:03 +00:00
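To make the refill arithmetic in points 3 and 4 above concrete, here is a small stand-alone C sketch (illustrative only, not part of the kernel source); the ~45 cycles per dTLB miss, the 512-entry dTLB, and the rough ~260 cycles per additional invlpg are all taken from the measurements and the cycles/page column quoted above.

#include <stdio.h>

int main(void)
{
	/* Figures quoted in the commit message above (IvyBridge i5-3320M). */
	const long dtlb_miss_cycles = 45;	/* ~17.1 ns per dTLB miss */
	const long dtlb_entries     = 512;	/* dTLB capacity assumed above */
	const long invlpg_cycles    = 260;	/* rough incremental cost per flushed page */

	/* Refilling the whole dTLB: 512 * ~45 = ~23,000 cycles (~22,000 quoted). */
	long refill_cycles = dtlb_entries * dtlb_miss_cycles;

	/* How many single-page invlpg flushes cost about the same? (~85 quoted) */
	long equiv_invlpg = refill_cycles / invlpg_cycles;

	printf("full dTLB refill : ~%ld cycles\n", refill_cycles);
	printf("equivalent invlpg: ~%ld operations\n", equiv_invlpg);
	return 0;
}

This is the back-of-the-envelope reasoning behind stopping the per-page flush at a ceiling of a few tens of pages rather than hundreds.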
|
|
|
/*
|
2019-06-07 18:54:32 +00:00
|
|
|
* See Documentation/x86/tlb.rst for details. We choose 33
|
2014-07-31 15:41:03 +00:00
|
|
|
* because it is large enough to cover the vast majority (at
|
|
|
|
* least 95%) of allocations, and is small enough that we are
|
|
|
|
* confident it will not cause too much overhead. Each single
|
|
|
|
* flush is about 100 ns, so this caps the maximum overhead at
|
|
|
|
* _about_ 3,000 ns.
|
|
|
|
*
|
|
|
|
* This is in units of pages.
|
|
|
|
*/
|
2018-12-03 17:03:49 +00:00
|
|
|
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
|
x86/mm: Rip out complicated, out-of-date, buggy TLB flushing
I think the flush_tlb_mm_range() code that tries to tune the
flush sizes based on the CPU needs to get ripped out for
several reasons:
1. It is obviously buggy. It uses mm->total_vm to judge the
task's footprint in the TLB. It should certainly be using
some measure of RSS, *NOT* ->total_vm since only resident
memory can populate the TLB.
2. Haswell, and several other CPUs are missing from the
intel_tlb_flushall_shift_set() function. Thus, it has been
demonstrated to bitrot quickly in practice.
3. It is plain wrong in my vm:
[ 0.037444] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 0.037444] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 0.037444] tlb_flushall_shift: 6
Which leads it to never use invlpg.
4. The assumptions about TLB refill costs are wrong:
http://lkml.kernel.org/r/1337782555-8088-3-git-send-email-alex.shi@intel.com
(more on this in later patches)
5. I cannot reproduce the original data: https://lkml.org/lkml/2012/5/17/59
I believe the sample times were too short. Running the
benchmark in a loop yields times that vary quite a bit.
Note that this leaves us with a static ceiling of 1 page. This
is a conservative, dumb setting, and will be revised in a later
patch.
This also removes the code which attempts to predict whether we
are flushing data or instructions. We expect instruction flushes
to be relatively rare and not worth tuning for explicitly.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154055.ABC88E89@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2014-07-31 15:40:55 +00:00
|
|
|
|
2019-04-25 23:01:43 +00:00
|
|
|
static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
|
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
|
|
|
|
#endif
|
|
|
|
|
2021-02-20 23:17:11 +00:00
|
|
|
static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
|
2019-04-25 23:01:43 +00:00
|
|
|
unsigned long start, unsigned long end,
|
|
|
|
unsigned int stride_shift, bool freed_tables,
|
|
|
|
u64 new_tlb_gen)
|
|
|
|
{
|
|
|
|
struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
|
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
/*
|
|
|
|
* Ensure that the following code is non-reentrant and flush_tlb_info
|
|
|
|
* is not overwritten. This means no TLB flushing is initiated by
|
|
|
|
* interrupt handlers and machine-check exception handlers.
|
|
|
|
*/
|
|
|
|
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
info->start = start;
|
|
|
|
info->end = end;
|
|
|
|
info->mm = mm;
|
|
|
|
info->stride_shift = stride_shift;
|
|
|
|
info->freed_tables = freed_tables;
|
|
|
|
info->new_tlb_gen = new_tlb_gen;
|
2021-02-20 23:17:05 +00:00
|
|
|
info->initiating_cpu = smp_processor_id();
|
2019-04-25 23:01:43 +00:00
|
|
|
|
|
|
|
return info;
|
|
|
|
}
|
|
|
|
|
2021-02-20 23:17:11 +00:00
|
|
|
static void put_flush_tlb_info(void)
|
2019-04-25 23:01:43 +00:00
|
|
|
{
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
2021-03-18 14:28:01 +00:00
|
|
|
/* Complete reentrancy prevention checks */
|
2019-04-25 23:01:43 +00:00
|
|
|
barrier();
|
|
|
|
this_cpu_dec(flush_tlb_info_idx);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
x86/tlb: enable tlb flush range support for x86
Not every tlb_flush execution moment really needs to evacuate all
TLB entries; in munmap, for example, a few 'invlpg' instructions are
better for whole-process performance, since they leave most of the TLB
entries in place for later accesses.
This patch also rewrites flush_tlb_range for two purposes:
1. Split it out to get the flush_tlb_mm_range() function.
2. Clean up to reduce line breaking, thanks to Borislav's input.
My micro benchmark 'mummap' (http://lkml.org/lkml/2012/5/17/59)
shows that random memory access on another CPU gets a 0~50% speedup
on a 2P * 4-core * HT NHM EP system while doing 'munmap'.
Thanks to Yongjie for testing this patch:
-------------
I used Linux 3.4-RC6 w/ and w/o his patches as Xen dom0 and guest
kernel.
After running two benchmarks in Xen HVM guest, I found his patches
brought about 1%~3% performance gain in 'kernel build' and 'netperf'
testing, though the performance gain was not very stable in 'kernel
build' testing.
Some detailed testing results are below.
Testing Environment:
Hardware: Romley-EP platform
Xen version: latest upstream
Linux kernel: 3.4-RC6
Guest vCPU number: 8
NIC: Intel 82599 (10GB bandwidth)
In 'kernel build' testing in guest:
Command line | performance gain
make -j 4 | 3.81%
make -j 8 | 0.37%
make -j 16 | -0.52%
In 'netperf' testing, we tested TCP_STREAM with default socket size
16384 byte as large packet and 64 byte as small packet.
I used several clients to add networking pressure; the 'netperf' server
then automatically generated several threads to respond to them.
I also used large-size packet and small-size packet in the testing.
Packet size | Thread number | performance gain
16384 bytes | 4 | 0.02%
16384 bytes | 8 | 2.21%
16384 bytes | 16 | 2.04%
64 bytes | 4 | 1.07%
64 bytes | 8 | 3.31%
64 bytes | 16 | 0.71%
Signed-off-by: Alex Shi <alex.shi@intel.com>
Link: http://lkml.kernel.org/r/1340845344-27557-8-git-send-email-alex.shi@intel.com
Tested-by: Ren, Yongjie <yongjie.ren@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2012-06-28 01:02:22 +00:00
|
|
|
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
2018-09-26 03:58:42 +00:00
|
|
|
unsigned long end, unsigned int stride_shift,
|
|
|
|
bool freed_tables)
|
2012-06-28 01:02:22 +00:00
|
|
|
{
|
2019-04-25 23:01:43 +00:00
|
|
|
struct flush_tlb_info *info;
|
|
|
|
u64 new_tlb_gen;
|
2017-05-28 17:00:12 +00:00
|
|
|
int cpu;
|
2017-04-22 07:01:21 +00:00
|
|
|
|
2017-05-28 17:00:12 +00:00
|
|
|
cpu = get_cpu();
|
2016-01-06 20:21:01 +00:00
|
|
|
|
2017-05-28 17:00:12 +00:00
|
|
|
/* Should we flush just the requested range? */
|
2019-04-25 23:01:43 +00:00
|
|
|
if ((end == TLB_FLUSH_ALL) ||
|
|
|
|
((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
|
|
|
|
start = 0;
|
|
|
|
end = TLB_FLUSH_ALL;
|
2014-07-31 15:40:54 +00:00
|
|
|
}
|
2017-05-28 17:00:12 +00:00
|
|
|
|
2019-04-25 23:01:43 +00:00
|
|
|
/* This is also a barrier that synchronizes with switch_mm(). */
|
|
|
|
new_tlb_gen = inc_mm_tlb_gen(mm);
|
|
|
|
|
|
|
|
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
|
|
|
|
new_tlb_gen);
|
|
|
|
|
2021-02-20 23:17:07 +00:00
|
|
|
/*
|
|
|
|
* flush_tlb_multi() is not optimized for the common case in which only
|
|
|
|
* a local TLB flush is needed. Optimize this use-case by calling
|
|
|
|
* flush_tlb_func() directly on the local CPU in this case.
|
|
|
|
*/
|
|
|
|
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
|
|
|
|
flush_tlb_multi(mm_cpumask(mm), info);
|
|
|
|
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
|
2019-04-25 23:01:43 +00:00
|
|
|
lockdep_assert_irqs_enabled();
|
2017-06-29 15:53:13 +00:00
|
|
|
local_irq_disable();
|
2021-02-20 23:17:05 +00:00
|
|
|
flush_tlb_func(info);
|
2017-06-29 15:53:13 +00:00
|
|
|
local_irq_enable();
|
|
|
|
}
|
|
|
|
|
2019-04-25 23:01:43 +00:00
|
|
|
put_flush_tlb_info();
|
2017-05-28 17:00:12 +00:00
|
|
|
put_cpu();
|
2008-03-03 17:12:54 +00:00
|
|
|
}
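As a usage illustration of the interface motivated by the 'enable tlb flush range support' commit message above, here is a hypothetical caller sketch; the wrapper function is made up for illustration, while flush_tlb_mm_range() and PAGE_SHIFT are the real interfaces, used as declared above.

/*
 * Hypothetical illustration only: flush a just-unmapped user range.
 * stride_shift selects the flush granularity (PAGE_SHIFT for 4K pages);
 * freed_tables is true when page-table pages may have been freed, so
 * that lazy-TLB CPUs are also sent the IPI (see native_flush_tlb_multi()).
 */
static void example_flush_unmapped_range(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	flush_tlb_mm_range(mm, start, end, PAGE_SHIFT, true);
}

Internally, ranges larger than tlb_single_page_flush_ceiling pages degenerate to a full flush, as shown in the function body above.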
|
|
|
|
|
2017-05-28 17:00:10 +00:00
|
|
|
|
2008-03-03 17:12:54 +00:00
|
|
|
static void do_flush_tlb_all(void *info)
|
|
|
|
{
|
2014-01-21 22:33:16 +00:00
|
|
|
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
|
2008-03-03 17:12:54 +00:00
|
|
|
__flush_tlb_all();
|
|
|
|
}
|
|
|
|
|
|
|
|
void flush_tlb_all(void)
|
|
|
|
{
|
2014-01-21 22:33:16 +00:00
|
|
|
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
|
2008-05-09 07:39:44 +00:00
|
|
|
on_each_cpu(do_flush_tlb_all, NULL, 1);
|
2008-03-03 17:12:54 +00:00
|
|
|
}
|
2012-06-28 01:02:20 +00:00
|
|
|
|
2012-06-28 01:02:24 +00:00
|
|
|
static void do_kernel_range_flush(void *info)
|
|
|
|
{
|
|
|
|
struct flush_tlb_info *f = info;
|
|
|
|
unsigned long addr;
|
|
|
|
|
|
|
|
/* Flush the range one page at a time with 'invlpg'. */
|
2017-05-28 17:00:10 +00:00
|
|
|
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
|
2020-04-21 09:20:35 +00:00
|
|
|
flush_tlb_one_kernel(addr);
|
2012-06-28 01:02:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
|
|
|
|
{
|
|
|
|
/* Balance as user space task's flush, a bit conservative */
|
2014-07-31 15:40:55 +00:00
|
|
|
if (end == TLB_FLUSH_ALL ||
|
2017-05-28 17:00:16 +00:00
|
|
|
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
|
2012-06-28 01:02:24 +00:00
|
|
|
on_each_cpu(do_flush_tlb_all, NULL, 1);
|
2014-07-31 15:40:55 +00:00
|
|
|
} else {
|
2019-04-25 23:01:43 +00:00
|
|
|
struct flush_tlb_info *info;
|
|
|
|
|
|
|
|
preempt_disable();
|
|
|
|
info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
|
|
|
|
|
|
|
|
on_each_cpu(do_kernel_range_flush, info, 1);
|
|
|
|
|
|
|
|
put_flush_tlb_info();
|
|
|
|
preempt_enable();
|
2012-06-28 01:02:24 +00:00
|
|
|
}
|
|
|
|
}
|
2014-07-31 15:41:01 +00:00
|
|
|
|
2020-04-21 09:20:28 +00:00
|
|
|
/*
|
|
|
|
* This can be used from process context to figure out what the value of
|
|
|
|
* CR3 is without needing to do a (slow) __read_cr3().
|
|
|
|
*
|
|
|
|
* It's intended to be used for code like KVM that sneakily changes CR3
|
|
|
|
* and needs to restore it. It needs to be used very carefully.
|
|
|
|
*/
|
|
|
|
unsigned long __get_current_cr3_fast(void)
|
|
|
|
{
|
|
|
|
unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
|
|
|
|
this_cpu_read(cpu_tlbstate.loaded_mm_asid));
|
|
|
|
|
|
|
|
/* For now, be very restrictive about when this can be called. */
|
|
|
|
VM_WARN_ON(in_nmi() || preemptible());
|
|
|
|
|
|
|
|
VM_BUG_ON(cr3 != __read_cr3());
|
|
|
|
return cr3;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
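A minimal hypothetical sketch of the save/restore pattern described in the comment above; the wrapper name and the temporary CR3 value are made up, while __get_current_cr3_fast(), native_write_cr3() and preempt_disable()/preempt_enable() are existing kernel interfaces. This is an illustration, not how KVM actually does it.

/* Hypothetical illustration of KVM-style "sneaky" CR3 save/restore. */
static void example_run_with_temporary_cr3(unsigned long tmp_cr3)
{
	unsigned long saved_cr3;

	preempt_disable();			/* the helper warns if preemptible */
	saved_cr3 = __get_current_cr3_fast();	/* cheap: avoids a slow CR3 read */

	native_write_cr3(tmp_cr3);		/* temporarily switch page tables;
						 * tmp_cr3 must reference valid tables */
	/* ... run with the temporary CR3 ... */

	native_write_cr3(saved_cr3);		/* restore the original CR3 */
	preempt_enable();
}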
|
|
|
|
|
2020-04-21 09:20:35 +00:00
|
|
|
/*
|
|
|
|
* Flush one page in the kernel mapping
|
|
|
|
*/
|
|
|
|
void flush_tlb_one_kernel(unsigned long addr)
|
|
|
|
{
|
|
|
|
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
|
|
|
|
* paravirt equivalent. Even with PCID, this is sufficient: we only
|
|
|
|
* use PCID if we also use global PTEs for the kernel mapping, and
|
|
|
|
* INVLPG flushes global translations across all address spaces.
|
|
|
|
*
|
|
|
|
* If PTI is on, then the kernel is mapped with non-global PTEs, and
|
|
|
|
* __flush_tlb_one_user() will flush the given address for the current
|
|
|
|
* kernel address space and for its usermode counterpart, but it does
|
|
|
|
* not flush it for other address spaces.
|
|
|
|
*/
|
|
|
|
flush_tlb_one_user(addr);
|
|
|
|
|
|
|
|
if (!static_cpu_has(X86_FEATURE_PTI))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* See above. We need to propagate the flush to all other address
|
|
|
|
* spaces. In principle, we only need to propagate it to kernelmode
|
|
|
|
* address spaces, but the extra bookkeeping we would need is not
|
|
|
|
* worth it.
|
|
|
|
*/
|
|
|
|
this_cpu_write(cpu_tlbstate.invalidate_other, true);
|
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:34 +00:00
|
|
|
/*
|
|
|
|
* Flush one page in the user mapping
|
|
|
|
*/
|
|
|
|
STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
|
|
|
|
{
|
|
|
|
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
|
|
|
|
|
|
|
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
|
|
|
|
|
|
|
|
if (!static_cpu_has(X86_FEATURE_PTI))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
|
|
|
|
* Just use invalidate_user_asid() in case we are called early.
|
|
|
|
*/
|
|
|
|
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
|
|
|
|
invalidate_user_asid(loaded_mm_asid);
|
|
|
|
else
|
|
|
|
invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
|
|
|
|
}
|
|
|
|
|
|
|
|
void flush_tlb_one_user(unsigned long addr)
|
|
|
|
{
|
|
|
|
__flush_tlb_one_user(addr);
|
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:33 +00:00
|
|
|
/*
|
|
|
|
* Flush everything
|
|
|
|
*/
|
|
|
|
STATIC_NOPV void native_flush_tlb_global(void)
|
|
|
|
{
|
|
|
|
unsigned long cr4, flags;
|
|
|
|
|
|
|
|
if (static_cpu_has(X86_FEATURE_INVPCID)) {
|
|
|
|
/*
|
|
|
|
* Using INVPCID is considerably faster than a pair of writes
|
|
|
|
* to CR4 sandwiched inside an IRQ flag save/restore.
|
|
|
|
*
|
|
|
|
* Note, this works with CR4.PCIDE=0 or 1.
|
|
|
|
*/
|
|
|
|
invpcid_flush_all();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read-modify-write to CR4 - protect it from preemption and
|
|
|
|
* from interrupts. (Use the raw variant because this code can
|
|
|
|
* be called from deep inside debugging code.)
|
|
|
|
*/
|
|
|
|
raw_local_irq_save(flags);
|
|
|
|
|
|
|
|
cr4 = this_cpu_read(cpu_tlbstate.cr4);
|
|
|
|
/* toggle PGE */
|
|
|
|
native_write_cr4(cr4 ^ X86_CR4_PGE);
|
|
|
|
/* write old PGE again and flush TLBs */
|
|
|
|
native_write_cr4(cr4);
|
|
|
|
|
|
|
|
raw_local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:32 +00:00
|
|
|
/*
|
|
|
|
* Flush the entire current user mapping
|
|
|
|
*/
|
|
|
|
STATIC_NOPV void native_flush_tlb_local(void)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Preemption or interrupts must be disabled to protect the access
|
|
|
|
* to the per CPU variable and to prevent being preempted between
|
|
|
|
* read_cr3() and write_cr3().
|
|
|
|
*/
|
|
|
|
WARN_ON_ONCE(preemptible());
|
|
|
|
|
|
|
|
invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
|
|
|
|
|
|
|
|
/* If current->mm == NULL then the read_cr3() "borrows" an mm */
|
|
|
|
native_write_cr3(__native_read_cr3());
|
|
|
|
}
|
|
|
|
|
|
|
|
void flush_tlb_local(void)
|
|
|
|
{
|
|
|
|
__flush_tlb_local();
|
|
|
|
}
|
2020-04-21 09:20:37 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Flush everything
|
|
|
|
*/
|
|
|
|
void __flush_tlb_all(void)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* This is to catch users with enabled preemption and the PGE feature
|
|
|
|
* who would otherwise not trigger the warning in __native_flush_tlb().
|
|
|
|
*/
|
|
|
|
VM_WARN_ON_ONCE(preemptible());
|
|
|
|
|
|
|
|
if (boot_cpu_has(X86_FEATURE_PGE)) {
|
|
|
|
__flush_tlb_global();
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* !PGE -> !PCID (setup_pcid()), thus every flush is total.
|
|
|
|
*/
|
|
|
|
flush_tlb_local();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__flush_tlb_all);
|
2020-04-21 09:20:32 +00:00
|
|
|
|
2017-05-22 22:30:03 +00:00
|
|
|
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
|
|
|
|
{
|
2021-02-20 23:17:05 +00:00
|
|
|
struct flush_tlb_info *info;
|
|
|
|
|
2017-05-22 22:30:03 +00:00
|
|
|
int cpu = get_cpu();
|
|
|
|
|
2021-02-20 23:17:05 +00:00
|
|
|
info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
|
2021-02-20 23:17:07 +00:00
|
|
|
/*
|
|
|
|
* flush_tlb_multi() is not optimized for the common case in which only
|
|
|
|
* a local TLB flush is needed. Optimize this use-case by calling
|
|
|
|
* flush_tlb_func() directly on the local CPU in this case.
|
|
|
|
*/
|
|
|
|
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
|
|
|
|
flush_tlb_multi(&batch->cpumask, info);
|
|
|
|
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
|
2019-04-25 23:01:43 +00:00
|
|
|
lockdep_assert_irqs_enabled();
|
2017-06-29 15:53:13 +00:00
|
|
|
local_irq_disable();
|
2021-02-20 23:17:05 +00:00
|
|
|
flush_tlb_func(info);
|
2017-06-29 15:53:13 +00:00
|
|
|
local_irq_enable();
|
|
|
|
}
|
|
|
|
|
2017-05-22 22:30:03 +00:00
|
|
|
cpumask_clear(&batch->cpumask);
|
|
|
|
|
2021-02-20 23:17:05 +00:00
|
|
|
put_flush_tlb_info();
|
2017-05-22 22:30:03 +00:00
|
|
|
put_cpu();
|
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:40 +00:00
|
|
|
/*
|
|
|
|
* Blindly accessing user memory from NMI context can be dangerous
|
|
|
|
* if we're in the middle of switching the current user task or
|
|
|
|
* switching the loaded mm. It can also be dangerous if we
|
|
|
|
* interrupted some kernel code that was temporarily using a
|
|
|
|
* different mm.
|
|
|
|
*/
|
|
|
|
bool nmi_uaccess_okay(void)
|
|
|
|
{
|
|
|
|
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
|
|
|
struct mm_struct *current_mm = current->mm;
|
|
|
|
|
|
|
|
VM_WARN_ON_ONCE(!loaded_mm);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The condition we want to check is
|
|
|
|
* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
|
|
|
|
* if we're running in a VM with shadow paging, and nmi_uaccess_okay()
|
|
|
|
* is supposed to be reasonably fast.
|
|
|
|
*
|
|
|
|
* Instead, we check the almost equivalent but somewhat conservative
|
|
|
|
* condition below, and we rely on the fact that switch_mm_irqs_off()
|
|
|
|
* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
|
|
|
|
*/
|
|
|
|
if (loaded_mm != current_mm)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-07-31 15:41:01 +00:00
|
|
|
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
|
|
|
|
size_t count, loff_t *ppos)
|
|
|
|
{
|
|
|
|
char buf[32];
|
|
|
|
unsigned int len;
|
|
|
|
|
|
|
|
len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
|
|
|
|
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t tlbflush_write_file(struct file *file,
|
|
|
|
const char __user *user_buf, size_t count, loff_t *ppos)
|
|
|
|
{
|
|
|
|
char buf[32];
|
|
|
|
ssize_t len;
|
|
|
|
int ceiling;
|
|
|
|
|
|
|
|
len = min(count, sizeof(buf) - 1);
|
|
|
|
if (copy_from_user(buf, user_buf, len))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
buf[len] = '\0';
|
|
|
|
if (kstrtoint(buf, 0, &ceiling))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (ceiling < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
tlb_single_page_flush_ceiling = ceiling;
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations fops_tlbflush = {
|
|
|
|
.read = tlbflush_read_file,
|
|
|
|
.write = tlbflush_write_file,
|
|
|
|
.llseek = default_llseek,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int __init create_tlb_single_page_flush_ceiling(void)
|
|
|
|
{
|
|
|
|
debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
|
|
|
|
arch_debugfs_dir, NULL, &fops_tlbflush);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
late_initcall(create_tlb_single_page_flush_ceiling);
|
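To show how the debugfs knob created above is typically exercised, here is a small user-space C sketch. Assumptions: debugfs is mounted at /sys/kernel/debug and arch_debugfs_dir corresponds to its x86 subdirectory, so the file appears as /sys/kernel/debug/x86/tlb_single_page_flush_ceiling; it must be run as root.

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/x86/tlb_single_page_flush_ceiling";
	char buf[32];
	FILE *f;

	/* Read the current ceiling (served by tlbflush_read_file() above). */
	f = fopen(path, "r");
	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror(path);
		return 1;
	}
	fclose(f);
	printf("current ceiling: %s", buf);

	/* Raise the ceiling to 64 pages (parsed by tlbflush_write_file() above). */
	f = fopen(path, "w");
	if (!f || fprintf(f, "64\n") < 0) {
		perror(path);
		return 1;
	}
	fclose(f);
	return 0;
}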