Merge branch 'for-next/perf-user-counter-access' into for-next/perf
* for-next/perf-user-counter-access:
  Documentation: arm64: Document PMU counters access from userspace
  arm64: perf: Enable PMU counter userspace access for perf event
  arm64: perf: Add userspace counter access disable switch
  perf: Add a counter for number of user access events in context
  x86: perf: Move RDPMC event flag to a common definition
commit 8bd09b41b8
@@ -905,6 +905,17 @@ enabled, otherwise writing to this file will return ``-EBUSY``.

The default value is 8.


perf_user_access (arm64 only)
=================================

Controls user space access for reading perf event counters. When set to 1,
user space can read performance monitor counter registers directly.

The default value is 0 (access disabled).

See Documentation/arm64/perf.rst for more information.
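
For example, the switch can be inspected and flipped at runtime from a root
shell (a minimal sketch; the path is the sysctl documented above):

.. code-block:: sh

  # 0 = disabled (default), 1 = userspace may read the counters directly
  cat /proc/sys/kernel/perf_user_access
  echo 1 > /proc/sys/kernel/perf_user_access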


pid_max
=======

@@ -2,7 +2,10 @@

.. _perf_index:

-=====================
+====
+Perf
+====
+
Perf Event Attributes
=====================

@@ -88,3 +91,76 @@ exclude_host. However when using !exclude_hv there is a small blackout
window at the guest entry/exit where host events are not captured.

On VHE systems there are no blackout windows.

Perf Userspace PMU Hardware Counter Access
==========================================

Overview
--------
The perf userspace tool relies on the PMU to monitor events. It offers an
abstraction layer over the hardware counters since the underlying
implementation is cpu-dependent.
Arm64 allows userspace tools to have access to the registers storing the
hardware counters' values directly.

This targets specifically self-monitoring tasks in order to reduce the overhead
by directly accessing the registers without having to go through the kernel.

How-to
------
The focus is set on the armv8 PMUv3, which makes sure that access to the PMU
registers is enabled and that userspace has access to the relevant
information in order to use them.

In order to have access to the hardware counters, the global sysctl
kernel/perf_user_access must first be enabled:

.. code-block:: sh

  echo 1 > /proc/sys/kernel/perf_user_access

It is necessary to open the event using the perf tool interface with the
config1:1 attr bit set: the sys_perf_event_open syscall returns a fd which can
subsequently be used with the mmap syscall in order to retrieve a page of memory
containing information about the event. The PMU driver uses this page to expose
to the user the hardware counter's index and other necessary data. Using this
index enables the user to access the PMU registers using the `mrs` instruction.
Access to the PMU registers is only valid while the sequence lock is unchanged.
In particular, the PMSELR_EL0 register is zeroed each time the sequence lock is
changed.
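
As an illustration, a self-monitoring read might look like the sketch below.
It is only a sketch: error handling and pmc_width masking are omitted, and it
assumes the event was placed on the dedicated cycle counter, whose userpage
index is 32 as described above, so any other index is treated as unreadable.

.. code-block:: c

  #include <linux/perf_event.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static inline unsigned long long read_cycle_counter(void)
  {
          unsigned long long val;

          /* Direct read, permitted by PMUSERENR_EL0 while access is on */
          asm volatile("mrs %0, pmccntr_el0" : "=r" (val));
          return val;
  }

  int main(void)
  {
          struct perf_event_attr attr = {
                  .size    = sizeof(attr),
                  .type    = PERF_TYPE_HARDWARE,
                  .config  = PERF_COUNT_HW_CPU_CYCLES,
                  .config1 = 0x2,         /* bit 1: request userspace access */
          };
          struct perf_event_mmap_page *pc;
          unsigned long long cycles = 0;
          unsigned int seq;
          int fd;

          fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
          pc = mmap(NULL, sysconf(_SC_PAGE_SIZE), PROT_READ, MAP_SHARED, fd, 0);

          /* Retry whenever the driver updated the page mid-read */
          do {
                  seq = pc->lock;
                  asm volatile("" ::: "memory");  /* compiler barrier */
                  if (pc->cap_user_rdpmc && pc->index == 32)
                          cycles = read_cycle_counter() + pc->offset;
                  asm volatile("" ::: "memory");
          } while (pc->lock != seq);

          printf("cycles: %llu\n", cycles);
          return 0;
  }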

The userspace access is supported in libperf using the perf_evsel__mmap()
and perf_evsel__read() functions. See `tools/lib/perf/tests/test-evsel.c`_ for
an example.
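
A minimal libperf user, modelled on that test, might look as follows (a
sketch without error handling; the calls are those exported by tools/lib/perf):

.. code-block:: c

  #include <linux/perf_event.h>
  #include <perf/evsel.h>
  #include <perf/threadmap.h>

  unsigned long long read_self_cycles(void)
  {
          struct perf_event_attr attr = {
                  .type    = PERF_TYPE_HARDWARE,
                  .config  = PERF_COUNT_HW_CPU_CYCLES,
                  .config1 = 0x2,         /* request userspace access */
          };
          struct perf_counts_values counts = { .val = 0 };
          struct perf_thread_map *threads;
          struct perf_evsel *evsel;

          threads = perf_thread_map__new_dummy();
          perf_thread_map__set_pid(threads, 0, 0);  /* monitor ourselves */

          evsel = perf_evsel__new(&attr);
          perf_evsel__open(evsel, NULL, threads);
          perf_evsel__mmap(evsel, 0);               /* map the user page */
          perf_evsel__read(evsel, 0, 0, &counts);   /* reads via the page */

          perf_evsel__close(evsel);
          perf_evsel__delete(evsel);
          perf_thread_map__put(threads);
          return counts.val;
  }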

About heterogeneous systems
---------------------------
On heterogeneous systems such as big.LITTLE, userspace PMU counter access can
only be enabled when the tasks are pinned to a homogeneous subset of cores and
the corresponding PMU instance is opened by specifying the 'type' attribute.
The use of generic event types is not supported in this case.
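
Each PMU instance is exposed under sysfs, and the value for the 'type'
attribute can be read from there. For example (a sketch; the PMU names are
platform-specific and shown here only as placeholders):

.. code-block:: sh

  # one event source per core type on a big.LITTLE system
  ls /sys/bus/event_source/devices/
  # e.g. armv8_cortex_a53  armv8_cortex_a72  ...

  # value to place in perf_event_attr.type
  cat /sys/bus/event_source/devices/armv8_cortex_a53/type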

Have a look at `tools/perf/arch/arm64/tests/user-events.c`_ for an example. It
can be run using the perf tool to check that the access to the registers works
correctly from userspace:

.. code-block:: sh

  perf test -v user

About chained events and counter sizes
--------------------------------------
The user can request either a 32-bit (config1:0 == 0) or 64-bit (config1:0 == 1)
counter along with userspace access. The sys_perf_event_open syscall will fail
if a 64-bit counter is requested and the hardware doesn't support 64-bit
counters. Chained events are not supported in conjunction with userspace counter
access. If a 32-bit counter is requested on hardware with 64-bit counters, then
userspace must treat the upper 32-bits read from the counter as UNKNOWN. The
'pmc_width' field in the user page will indicate the valid width of the counter
and should be used to mask the upper bits as needed.
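
The masking can be done with a pair of shifts, mirroring what libperf does
internally (a sketch; `raw` stands for the value read from the counter):

.. code-block:: c

  #include <linux/types.h>

  static inline __u64 mask_counter(__u64 raw, __u16 pmc_width)
  {
          /* Bits at and above pmc_width are UNKNOWN; drop them */
          raw <<= 64 - pmc_width;
          raw >>= 64 - pmc_width;
          return raw;
  }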

.. Links
.. _tools/perf/arch/arm64/tests/user-events.c:
   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/arch/arm64/tests/user-events.c
.. _tools/lib/perf/tests/test-evsel.c:
   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/perf/tests/test-evsel.c

@@ -285,15 +285,24 @@ static const struct attribute_group armv8_pmuv3_events_attr_group = {

PMU_FORMAT_ATTR(event, "config:0-15");
PMU_FORMAT_ATTR(long, "config1:0");
PMU_FORMAT_ATTR(rdpmc, "config1:1");

static int sysctl_perf_user_access __read_mostly;

static inline bool armv8pmu_event_is_64bit(struct perf_event *event)
{
        return event->attr.config1 & 0x1;
}

static inline bool armv8pmu_event_want_user_access(struct perf_event *event)
{
        return event->attr.config1 & 0x2;
}

static struct attribute *armv8_pmuv3_format_attrs[] = {
        &format_attr_event.attr,
        &format_attr_long.attr,
        &format_attr_rdpmc.attr,
        NULL,
};

@@ -362,7 +371,7 @@ static const struct attribute_group armv8_pmuv3_caps_attr_group = {
 */
#define ARMV8_IDX_CYCLE_COUNTER       0
#define ARMV8_IDX_COUNTER0            1
#define ARMV8_IDX_CYCLE_COUNTER_USER  32

/*
 * We unconditionally enable ARMv8.5-PMU long event counter support

@@ -374,18 +383,22 @@ static bool armv8pmu_has_long_event(struct arm_pmu *cpu_pmu)
        return (cpu_pmu->pmuver >= ID_AA64DFR0_PMUVER_8_5);
}

static inline bool armv8pmu_event_has_user_read(struct perf_event *event)
{
        return event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT;
}

/*
 * We must chain two programmable counters for 64 bit events,
 * except when we have allocated the 64bit cycle counter (for CPU
- * cycles event). This must be called only when the event has
- * a counter allocated.
+ * cycles event) or when user space counter access is enabled.
 */
static inline bool armv8pmu_event_is_chained(struct perf_event *event)
{
        int idx = event->hw.idx;
        struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);

-       return !WARN_ON(idx < 0) &&
+       return !armv8pmu_event_has_user_read(event) &&
               armv8pmu_event_is_64bit(event) &&
               !armv8pmu_has_long_event(cpu_pmu) &&
               (idx != ARMV8_IDX_CYCLE_COUNTER);

@@ -718,6 +731,28 @@ static inline u32 armv8pmu_getreset_flags(void)
        return value;
}

static void armv8pmu_disable_user_access(void)
{
        write_sysreg(0, pmuserenr_el0);
}

static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
{
        int i;
        struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);

        /* Clear any unused counters to avoid leaking their contents */
        for_each_clear_bit(i, cpuc->used_mask, cpu_pmu->num_events) {
                if (i == ARMV8_IDX_CYCLE_COUNTER)
                        write_sysreg(0, pmccntr_el0);
                else
                        armv8pmu_write_evcntr(i, 0);
        }

        write_sysreg(0, pmuserenr_el0);
        write_sysreg(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR, pmuserenr_el0);
}

static void armv8pmu_enable_event(struct perf_event *event)
{
        /*

@@ -761,6 +796,14 @@ static void armv8pmu_disable_event(struct perf_event *event)

static void armv8pmu_start(struct arm_pmu *cpu_pmu)
{
        struct perf_event_context *task_ctx =
                this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx;

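        /*
         * Enable direct access only when the sysctl allows it and the task
         * context about to run contains user-read events.
         */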
        if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user)
                armv8pmu_enable_user_access(cpu_pmu);
        else
                armv8pmu_disable_user_access();

        /* Enable all counters */
        armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
}

@@ -878,13 +921,16 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
        if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) {
                if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask))
                        return ARMV8_IDX_CYCLE_COUNTER;
                else if (armv8pmu_event_is_64bit(event) &&
                         armv8pmu_event_want_user_access(event) &&
                         !armv8pmu_has_long_event(cpu_pmu))
                        return -EAGAIN;
        }

        /*
         * Otherwise use events counters
         */
-       if (armv8pmu_event_is_64bit(event) &&
-           !armv8pmu_has_long_event(cpu_pmu))
+       if (armv8pmu_event_is_chained(event))
                return armv8pmu_get_chain_idx(cpuc, cpu_pmu);
        else
                return armv8pmu_get_single_idx(cpuc, cpu_pmu);

@@ -900,6 +946,22 @@ static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc,
        clear_bit(idx - 1, cpuc->used_mask);
}

static int armv8pmu_user_event_idx(struct perf_event *event)
{
        if (!sysctl_perf_user_access || !armv8pmu_event_has_user_read(event))
                return 0;

        /*
         * We remap the cycle counter index to 32 to
         * match the offset applied to the rest of
         * the counter indices.
         */
        if (event->hw.idx == ARMV8_IDX_CYCLE_COUNTER)
                return ARMV8_IDX_CYCLE_COUNTER_USER;

        return event->hw.idx;
}

/*
 * Add an event filter to a given event.
 */

@@ -996,6 +1058,25 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
        if (armv8pmu_event_is_64bit(event))
                event->hw.flags |= ARMPMU_EVT_64BIT;

        /*
         * User events must be allocated into a single counter, and so
         * must not be chained.
         *
         * Most 64-bit events require long counter support, but 64-bit
         * CPU_CYCLES events can be placed into the dedicated cycle
         * counter when this is free.
         */
        if (armv8pmu_event_want_user_access(event)) {
                if (!(event->attach_state & PERF_ATTACH_TASK))
                        return -EINVAL;
                if (armv8pmu_event_is_64bit(event) &&
                    (hw_event_id != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) &&
                    !armv8pmu_has_long_event(armpmu))
                        return -EOPNOTSUPP;

                event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
        }

        /* Only expose micro/arch events supported by this PMU */
        if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS)
            && test_bit(hw_event_id, armpmu->pmceid_bitmap)) {

@@ -1104,6 +1185,35 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
        return probe.present ? 0 : -ENODEV;
}

static void armv8pmu_disable_user_access_ipi(void *unused)
{
        armv8pmu_disable_user_access();
}

static int armv8pmu_proc_user_access_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
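        /*
         * A failed update, a plain read, or a write that leaves access
         * enabled needs no further work; otherwise the write disabled
         * access, so revoke it on every CPU.
         */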
        if (ret || !write || sysctl_perf_user_access)
                return ret;

        on_each_cpu(armv8pmu_disable_user_access_ipi, NULL, 1);
        return 0;
}

static struct ctl_table armv8_pmu_sysctl_table[] = {
        {
                .procname       = "perf_user_access",
                .data           = &sysctl_perf_user_access,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = armv8pmu_proc_user_access_handler,
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_ONE,
        },
        { }
};

static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
                          int (*map_event)(struct perf_event *event),
                          const struct attribute_group *events,

@@ -1127,6 +1237,8 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
        cpu_pmu->set_event_filter = armv8pmu_set_event_filter;
        cpu_pmu->filter_match = armv8pmu_filter_match;

        cpu_pmu->pmu.event_idx = armv8pmu_user_event_idx;

        cpu_pmu->name = name;
        cpu_pmu->map_event = map_event;
        cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = events ?

@@ -1136,6 +1248,8 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
        cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ?
                caps : &armv8_pmuv3_caps_attr_group;

        register_sysctl("kernel", armv8_pmu_sysctl_table);

        return 0;
}

@@ -1301,6 +1415,14 @@ void arch_perf_update_userpage(struct perf_event *event,
        userpg->cap_user_time = 0;
        userpg->cap_user_time_zero = 0;
        userpg->cap_user_time_short = 0;
        userpg->cap_user_rdpmc = armv8pmu_event_has_user_read(event);

        if (userpg->cap_user_rdpmc) {
                if (event->hw.flags & ARMPMU_EVT_64BIT)
                        userpg->pmc_width = 64;
                else
                        userpg->pmc_width = 32;
        }

        do {
                rd = sched_clock_read_begin(&seq);

@@ -2476,7 +2476,7 @@ static int x86_pmu_event_init(struct perf_event *event)

        if (READ_ONCE(x86_pmu.attr_rdpmc) &&
            !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
-               event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+               event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;

        return err;
}

@@ -2510,7 +2510,7 @@ void perf_clear_dirty_counters(void)

static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+       if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
                return;

        /*

@@ -2531,7 +2531,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)

static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+       if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
                return;

        if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))

@@ -2542,7 +2542,7 @@ static int x86_pmu_event_idx(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;

-       if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+       if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
                return 0;

        if (is_metric_idx(hwc->idx))

@@ -2725,7 +2725,7 @@ void arch_perf_update_userpage(struct perf_event *event,
        userpg->cap_user_time = 0;
        userpg->cap_user_time_zero = 0;
        userpg->cap_user_rdpmc =
-               !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+               !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
        userpg->pmc_width = x86_pmu.cntval_bits;

        if (!using_native_sched_clock() || !sched_clock_stable())

@@ -74,7 +74,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_PEBS_NA_HSW      0x0010 /* haswell style datala, unknown */
#define PERF_X86_EVENT_EXCL             0x0020 /* HT exclusivity on counter */
#define PERF_X86_EVENT_DYNAMIC          0x0040 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED    0x0080 /* grant rdpmc permission */
+
#define PERF_X86_EVENT_EXCL_ACCT        0x0100 /* accounted EXCL event */
#define PERF_X86_EVENT_AUTO_RELOAD      0x0200 /* use PEBS auto-reload */
#define PERF_X86_EVENT_LARGE_PEBS       0x0400 /* use large PEBS */

@@ -129,6 +129,15 @@ struct hw_perf_event_extra {
        int             idx;    /* index in shared_regs->regs[] */
};

/**
 * hw_perf_event::flag values
 *
 * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
 * usage.
 */
#define PERF_EVENT_FLAG_ARCH            0x0000ffff
#define PERF_EVENT_FLAG_USER_READ_CNT   0x80000000

/**
 * struct hw_perf_event - performance event hardware details:
 */

@@ -822,6 +831,7 @@ struct perf_event_context {

        int                     nr_events;
        int                     nr_active;
        int                     nr_user;
        int                     is_active;
        int                     nr_stat;
        int                     nr_freq;

@@ -1808,6 +1808,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)

        list_add_rcu(&event->event_entry, &ctx->event_list);
        ctx->nr_events++;
        if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
                ctx->nr_user++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;

@@ -1999,6 +2001,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
        event->attach_state &= ~PERF_ATTACH_CONTEXT;

        ctx->nr_events--;
        if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
                ctx->nr_user--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;