Merge branch 'x86/cache' into perf/core, to resolve conflicts

Avoid conflict with upcoming perf/core patches, merge in the RDT perf work.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar 2018-10-02 09:51:41 +02:00
commit a4c9f26533
4 changed files with 257 additions and 164 deletions

View File

@ -520,18 +520,24 @@ the pseudo-locked region:
2) Cache hit and miss measurements using model specific precision counters if
available. Depending on the levels of cache on the system the pseudo_lock_l2
and pseudo_lock_l3 tracepoints are available.
WARNING: triggering this measurement uses from two (for just L2
measurements) to four (for L2 and L3 measurements) precision counters on
the system, if any other measurements are in progress the counters and
their corresponding event registers will be clobbered.
When a pseudo-locked region is created a new debugfs directory is created for
it in debugfs as /sys/kernel/debug/resctrl/<newdir>. A single
write-only file, pseudo_lock_measure, is present in this directory. The
measurement on the pseudo-locked region depends on the number, 1 or 2,
written to this debugfs file. Since the measurements are recorded with the
tracing infrastructure the relevant tracepoints need to be enabled before the
measurement is triggered.
measurement of the pseudo-locked region depends on the number written to this
debugfs file:
1 - writing "1" to the pseudo_lock_measure file will trigger the latency
measurement captured in the pseudo_lock_mem_latency tracepoint. See
example below.
2 - writing "2" to the pseudo_lock_measure file will trigger the L2 cache
residency (cache hits and misses) measurement captured in the
pseudo_lock_l2 tracepoint. See example below.
3 - writing "3" to the pseudo_lock_measure file will trigger the L3 cache
residency (cache hits and misses) measurement captured in the
pseudo_lock_l3 tracepoint.
All measurements are recorded with the tracing infrastructure. This requires
the relevant tracepoints to be enabled before the measurement is triggered.
Example of latency debugging interface:
In this example a pseudo-locked region named "newlock" was created. Here is

View File

@ -1033,6 +1033,27 @@ static inline void x86_assign_hw_event(struct perf_event *event,
}
}
/**
* x86_perf_rdpmc_index - Return PMC counter used for event
* @event: the perf_event to which the PMC counter was assigned
*
* The counter assigned to this performance event may change if interrupts
* are enabled. This counter should thus never be used while interrupts are
* enabled. Before this function is used to obtain the assigned counter the
* event should be checked for validity using, for example,
* perf_event_read_local(), within the same interrupt disabled section in
* which this counter is planned to be used.
*
* Return: The index of the performance monitoring counter assigned to
* @perf_event.
*/
int x86_perf_rdpmc_index(struct perf_event *event)
{
lockdep_assert_irqs_disabled();
return event->hw.event_base_rdpmc;
}
static inline int match_prev_assignment(struct hw_perf_event *hwc,
struct cpu_hw_events *cpuc,
int i)

View File

@ -278,6 +278,7 @@ struct perf_guest_switch_msr {
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
extern void perf_check_microcode(void);
extern int x86_perf_rdpmc_index(struct perf_event *event);
#else
static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
{

View File

@ -17,6 +17,7 @@
#include <linux/debugfs.h>
#include <linux/kthread.h>
#include <linux/mman.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
@ -26,6 +27,7 @@
#include <asm/intel_rdt_sched.h>
#include <asm/perf_event.h>
#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "intel_rdt.h"
#define CREATE_TRACE_POINTS
@ -106,16 +108,6 @@ static u64 get_prefetch_disable_bits(void)
return 0;
}
/*
* Helper to write 64bit value to MSR without tracing. Used when
* use of the cache should be restricted and use of registers used
* for local variables avoided.
*/
static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val)
{
__wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
}
/**
* pseudo_lock_minor_get - Obtain available minor number
* @minor: Pointer to where new minor number will be stored
@ -886,31 +878,14 @@ static int measure_cycles_lat_fn(void *_plr)
struct pseudo_lock_region *plr = _plr;
unsigned long i;
u64 start, end;
#ifdef CONFIG_KASAN
/*
* The registers used for local register variables are also used
* when KASAN is active. When KASAN is active we use a regular
* variable to ensure we always use a valid pointer to access memory.
* The cost is that accessing this pointer, which could be in
* cache, will be included in the measurement of memory read latency.
*/
void *mem_r;
#else
#ifdef CONFIG_X86_64
register void *mem_r asm("rbx");
#else
register void *mem_r asm("ebx");
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_KASAN */
local_irq_disable();
/*
* The wrmsr call may be reordered with the assignment below it.
* Call wrmsr as directly as possible to avoid tracing clobbering
* local register variable used for memory pointer.
* Disable hardware prefetchers.
*/
__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
mem_r = plr->kmem;
wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
mem_r = READ_ONCE(plr->kmem);
/*
* Dummy execute of the time measurement to load the needed
* instructions into the L1 instruction cache.
@ -932,157 +907,240 @@ static int measure_cycles_lat_fn(void *_plr)
return 0;
}
static int measure_cycles_perf_fn(void *_plr)
/*
* Create a perf_event_attr for the hit and miss perf events that will
* be used during the performance measurement. A perf_event maintains
* a pointer to its perf_event_attr so a unique attribute structure is
* created for each perf_event.
*
* The actual configuration of the event is set right before use in order
* to use the X86_CONFIG macro.
*/
static struct perf_event_attr perf_miss_attr = {
.type = PERF_TYPE_RAW,
.size = sizeof(struct perf_event_attr),
.pinned = 1,
.disabled = 0,
.exclude_user = 1,
};
static struct perf_event_attr perf_hit_attr = {
.type = PERF_TYPE_RAW,
.size = sizeof(struct perf_event_attr),
.pinned = 1,
.disabled = 0,
.exclude_user = 1,
};
struct residency_counts {
u64 miss_before, hits_before;
u64 miss_after, hits_after;
};
static int measure_residency_fn(struct perf_event_attr *miss_attr,
struct perf_event_attr *hit_attr,
struct pseudo_lock_region *plr,
struct residency_counts *counts)
{
unsigned long long l3_hits = 0, l3_miss = 0;
u64 l3_hit_bits = 0, l3_miss_bits = 0;
struct pseudo_lock_region *plr = _plr;
unsigned long long l2_hits, l2_miss;
u64 l2_hit_bits, l2_miss_bits;
unsigned long i;
#ifdef CONFIG_KASAN
/*
* The registers used for local register variables are also used
* when KASAN is active. When KASAN is active we use regular variables
* at the cost of including cache access latency to these variables
* in the measurements.
*/
u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
struct perf_event *miss_event, *hit_event;
int hit_pmcnum, miss_pmcnum;
unsigned int line_size;
unsigned int size;
unsigned long i;
void *mem_r;
#else
register unsigned int line_size asm("esi");
register unsigned int size asm("edi");
#ifdef CONFIG_X86_64
register void *mem_r asm("rbx");
#else
register void *mem_r asm("ebx");
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_KASAN */
u64 tmp;
/*
* Non-architectural event for the Goldmont Microarchitecture
* from Intel x86 Architecture Software Developer Manual (SDM):
* MEM_LOAD_UOPS_RETIRED D1H (event number)
* Umask values:
* L1_HIT 01H
* L2_HIT 02H
* L1_MISS 08H
* L2_MISS 10H
*
* On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
* has two "no fix" errata associated with it: BDM35 and BDM100. On
* this platform we use the following events instead:
* L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/)
* REFERENCES FFH
* MISS 3FH
* LONGEST_LAT_CACHE 2EH (Documented in SDM)
* REFERENCE 4FH
* MISS 41H
*/
/*
* Start by setting flags for IA32_PERFEVTSELx:
* OS (Operating system mode) 0x2
* INT (APIC interrupt enable) 0x10
* EN (Enable counter) 0x40
*
* Then add the Umask value and event number to select performance
* event.
*/
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_ATOM_GEMINI_LAKE:
l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1;
l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1;
break;
case INTEL_FAM6_BROADWELL_X:
/* On BDW the l2_hit_bits count references, not hits */
l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24;
l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24;
/* On BDW the l3_hit_bits count references, not hits */
l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e;
l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e;
break;
default:
miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
NULL, NULL, NULL);
if (IS_ERR(miss_event))
goto out;
}
hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
NULL, NULL, NULL);
if (IS_ERR(hit_event))
goto out_miss;
local_irq_disable();
/*
* Call wrmsr direcly to avoid the local register variables from
* being overwritten due to reordering of their assignment with
* the wrmsr calls.
* Check any possible error state of events used by performing
* one local read.
*/
__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
/* Disable events and reset counters */
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0);
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0);
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0);
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0);
if (l3_hit_bits > 0) {
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0);
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0);
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0);
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0);
if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
local_irq_enable();
goto out_hit;
}
/* Set and enable the L2 counters */
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits);
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits);
if (l3_hit_bits > 0) {
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
l3_hit_bits);
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
l3_miss_bits);
if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
local_irq_enable();
goto out_hit;
}
mem_r = plr->kmem;
size = plr->size;
line_size = plr->line_size;
/*
* Disable hardware prefetchers.
*/
wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
/* Initialize rest of local variables */
/*
* Performance event has been validated right before this with
* interrupts disabled - it is thus safe to read the counter index.
*/
miss_pmcnum = x86_perf_rdpmc_index(miss_event);
hit_pmcnum = x86_perf_rdpmc_index(hit_event);
line_size = READ_ONCE(plr->line_size);
mem_r = READ_ONCE(plr->kmem);
size = READ_ONCE(plr->size);
/*
* Read counter variables twice - first to load the instructions
* used in L1 cache, second to capture accurate value that does not
* include cache misses incurred because of instruction loads.
*/
rdpmcl(hit_pmcnum, hits_before);
rdpmcl(miss_pmcnum, miss_before);
/*
* From SDM: Performing back-to-back fast reads are not guaranteed
* to be monotonic.
* Use LFENCE to ensure all previous instructions are retired
* before proceeding.
*/
rmb();
rdpmcl(hit_pmcnum, hits_before);
rdpmcl(miss_pmcnum, miss_before);
/*
* Use LFENCE to ensure all previous instructions are retired
* before proceeding.
*/
rmb();
for (i = 0; i < size; i += line_size) {
/*
* Add a barrier to prevent speculative execution of this
* loop reading beyond the end of the buffer.
*/
rmb();
asm volatile("mov (%0,%1,1), %%eax\n\t"
:
: "r" (mem_r), "r" (i)
: "%eax", "memory");
}
/*
* Call wrmsr directly (no tracing) to not influence
* the cache access counters as they are disabled.
* Use LFENCE to ensure all previous instructions are retired
* before proceeding.
*/
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0,
l2_hit_bits & ~(0x40ULL << 16));
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1,
l2_miss_bits & ~(0x40ULL << 16));
if (l3_hit_bits > 0) {
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
l3_hit_bits & ~(0x40ULL << 16));
pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
l3_miss_bits & ~(0x40ULL << 16));
}
l2_hits = native_read_pmc(0);
l2_miss = native_read_pmc(1);
if (l3_hit_bits > 0) {
l3_hits = native_read_pmc(2);
l3_miss = native_read_pmc(3);
}
rmb();
rdpmcl(hit_pmcnum, hits_after);
rdpmcl(miss_pmcnum, miss_after);
/*
* Use LFENCE to ensure all previous instructions are retired
* before proceeding.
*/
rmb();
/* Re-enable hardware prefetchers */
wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
local_irq_enable();
out_hit:
perf_event_release_kernel(hit_event);
out_miss:
perf_event_release_kernel(miss_event);
out:
/*
* On BDW we count references and misses, need to adjust. Sometimes
* the "hits" counter is a bit more than the references, for
* example, x references but x + 1 hits. To not report invalid
* hit values in this case we treat that as misses eaqual to
* references.
* All counts will be zero on failure.
*/
if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss);
trace_pseudo_lock_l2(l2_hits, l2_miss);
if (l3_hit_bits > 0) {
if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss);
trace_pseudo_lock_l3(l3_hits, l3_miss);
counts->miss_before = miss_before;
counts->hits_before = hits_before;
counts->miss_after = miss_after;
counts->hits_after = hits_after;
return 0;
}
static int measure_l2_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
/*
* Non-architectural event for the Goldmont Microarchitecture
* from Intel x86 Architecture Software Developer Manual (SDM):
* MEM_LOAD_UOPS_RETIRED D1H (event number)
* Umask values:
* L2_HIT 02H
* L2_MISS 10H
*/
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_ATOM_GEMINI_LAKE:
perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
.umask = 0x10);
perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
.umask = 0x2);
break;
default:
goto out;
}
measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
/*
* If a failure prevented the measurements from succeeding
* tracepoints will still be written and all counts will be zero.
*/
trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
counts.miss_after - counts.miss_before);
out:
plr->thread_done = 1;
wake_up_interruptible(&plr->lock_thread_wq);
return 0;
}
static int measure_l3_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
/*
* On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
* has two "no fix" errata associated with it: BDM35 and BDM100. On
* this platform the following events are used instead:
* LONGEST_LAT_CACHE 2EH (Documented in SDM)
* REFERENCE 4FH
* MISS 41H
*/
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_BROADWELL_X:
/* On BDW the hit event counts references, not hits */
perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
.umask = 0x4f);
perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
.umask = 0x41);
break;
default:
goto out;
}
measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
/*
* If a failure prevented the measurements from succeeding
* tracepoints will still be written and all counts will be zero.
*/
counts.miss_after -= counts.miss_before;
if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) {
/*
* On BDW references and misses are counted, need to adjust.
* Sometimes the "hits" counter is a bit more than the
* references, for example, x references but x + 1 hits.
* To not report invalid hit values in this case we treat
* that as misses equal to references.
*/
/* First compute the number of cache references measured */
counts.hits_after -= counts.hits_before;
/* Next convert references to cache hits */
counts.hits_after -= min(counts.miss_after, counts.hits_after);
} else {
counts.hits_after -= counts.hits_before;
}
trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
out:
plr->thread_done = 1;
wake_up_interruptible(&plr->lock_thread_wq);
@ -1121,13 +1179,20 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
goto out;
}
plr->cpu = cpu;
if (sel == 1)
thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
else if (sel == 2)
thread = kthread_create_on_node(measure_cycles_perf_fn, plr,
thread = kthread_create_on_node(measure_l2_residency, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
else if (sel == 3)
thread = kthread_create_on_node(measure_l3_residency, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);