forked from Minki/linux
perf/core improvements and fixes:
User visible: - Tooling support for TopDown counters, recently added to the kernel (Andi Kleen) - Show call graphs in 'perf script' when 1st event doesn't have it but some other has (He Kuang) - Fix terminal cleanup when handling invalid .perfconfig files in 'perf top' (Taeung Song) Build fixes: - Respect CROSS_COMPILE for the linker in libapi (Lucas Stach) Infrastructure: - Fix perf_evlist__alloc_mmap() failure path (Wang Nan) - Provide way to extract integer value from format_field (Arnaldo Carvalho de Melo) Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> -----BEGIN PGP SIGNATURE----- Version: GnuPG v2 iQIcBAABCAAGBQJXVeP9AAoJENZQFvNTUqpAe8oP/10gbODr38JjOwrgARoux7so CVLpaxI21tpfEd3RTSPQSHx7wbph1luWy/GI2KDu39futg5FRDKNzVK7Ciy5VUx6 dW2BKhDjQb97G6X62Zp8rsw7jwNN775ot8E7XFzEMl6bHVRkU0bO4cBM8rMh65NW dP4UMhE65hEkInxPpr87SYyxgKoWAYao+ZXFyIXzqApUZJ2Enqp7uNwDt5x4zedh 3d9y7EUQpaDmXcdtcSFfsZCa5B+3/5HKYWksX6ofDcwvTZUTtjEslAPH9Xt+dmYn x+HoMvM3ZX33UeYfTiPYjEAmOUdis8ZYvbSf0XdtpYTNUsiw+YgUcBbWsYOZrdvk N+dCRoVlaU4BBn1UpPWu1o0BC/uBgVjkWbXsvVSZC73cJXOoB1PbMzHTLvX22KxA gcGOG5EpgKKf+vz3NwwRNRhmjIh97ChA/OOP8TYIBEWFheOtBvnHJvRawG4vDR09 4Ywg2uecbZfN+UI3qv7kis9TvV03RWgpCCWMClAmrm3RGqNFfGQGIEoRucwcUGK7 TzfvU2ycetw6hizoLgXR8gyYhhkEiZ5rhogvx+5SxVh+g9tSxLrWZxhU6jDH5zRp vAJ1hUFkafOC60qGEkfGVnFdo1zLWhhGmN1W5mzSsYfWZCVuhDduD3i10UfX4dip 3gNEajV6R96WEEuGeIbs =60TB -----END PGP SIGNATURE----- Merge tag 'perf-core-for-mingo-20160606' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo: User visible changes: - Tooling support for TopDown counters, recently added to the kernel (Andi Kleen) - Show call graphs in 'perf script' when 1st event doesn't have it but some other has (He Kuang) - Fix terminal cleanup when handling invalid .perfconfig files in 'perf top' (Taeung Song) Build fixes: - Respect CROSS_COMPILE for the linker in libapi (Lucas Stach) Infrastructure changes: - Fix 
perf_evlist__alloc_mmap() failure path (Wang Nan) - Provide way to extract integer value from format_field (Arnaldo Carvalho de Melo) Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
commit
aa3a655b15
@ -10,6 +10,7 @@ endif
|
||||
|
||||
CC = $(CROSS_COMPILE)gcc
|
||||
AR = $(CROSS_COMPILE)ar
|
||||
LD = $(CROSS_COMPILE)ld
|
||||
|
||||
MAKEFLAGS += --no-print-directory
|
||||
|
||||
|
@ -204,6 +204,38 @@ Aggregate counts per physical processor for system-wide mode measurements.
|
||||
--no-aggr::
|
||||
Do not aggregate counts across all monitored CPUs.
|
||||
|
||||
--topdown::
|
||||
Print top down level 1 metrics if supported by the CPU. This makes it possible to
|
||||
determine bottlenecks in the CPU pipeline for CPU-bound workloads,
|
||||
by breaking the cycles consumed down into frontend bound, backend bound,
|
||||
bad speculation and retiring.
|
||||
|
||||
Frontend bound means that the CPU cannot fetch and decode instructions fast
|
||||
enough. Backend bound means that computation or memory access is the
|
||||
bottleneck. Bad Speculation means that the CPU wasted cycles due to branch
|
||||
mispredictions and similar issues. Retiring means that the CPU computed without
|
||||
an apparent bottleneck. The bottleneck is only the real bottleneck
|
||||
if the workload is actually bound by the CPU and not by something else.
|
||||
|
||||
For best results it is usually a good idea to use it with interval
|
||||
mode like -I 1000, as the bottleneck of workloads can change often.
|
||||
|
||||
The top down metrics are collected per core instead of per
|
||||
CPU thread. Per core mode is automatically enabled
|
||||
and -a (global monitoring) is needed, requiring root rights or
|
||||
kernel.perf_event_paranoid=-1.
|
||||
|
||||
Topdown uses the full Performance Monitoring Unit, and needs
|
||||
disabling of the NMI watchdog (as root):
|
||||
echo 0 > /proc/sys/kernel/nmi_watchdog
|
||||
for best results. Otherwise the bottlenecks may be inconsistent
|
||||
on workloads with changing phases.
|
||||
|
||||
This enables --metric-only, unless overridden with --no-metric-only.
|
||||
|
||||
To interpret the results it is usually needed to know on which
|
||||
CPUs the workload runs. If needed the CPUs can be forced using
|
||||
taskset.
|
||||
|
||||
EXAMPLES
|
||||
--------
|
||||
|
@ -3,6 +3,7 @@ libperf-y += tsc.o
|
||||
libperf-y += pmu.o
|
||||
libperf-y += kvm-stat.o
|
||||
libperf-y += perf_regs.o
|
||||
libperf-y += group.o
|
||||
|
||||
libperf-$(CONFIG_DWARF) += dwarf-regs.o
|
||||
libperf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
|
||||
|
27
tools/perf/arch/x86/util/group.c
Normal file
27
tools/perf/arch/x86/util/group.c
Normal file
@ -0,0 +1,27 @@
|
||||
#include <stdio.h>
|
||||
#include "api/fs/fs.h"
|
||||
#include "util/group.h"
|
||||
|
||||
/*
 * Check whether we can use a group for top down.
 * Without a group we may get bad results due to multiplexing.
 *
 * Reads the kernel/nmi_watchdog sysctl:
 *  - read failure:      returns false, *warn is false
 *  - watchdog enabled:  returns false, *warn is true
 *  - watchdog disabled: returns true,  *warn is false
 *
 * @warn: out parameter, always written; true only when the caller should
 *        emit arch_topdown_group_warn().
 */
bool arch_topdown_check_group(bool *warn)
{
	int n;

	/*
	 * Always define the out parameter, like the __weak fallback does, so
	 * callers do not depend on having pre-initialized it.
	 */
	*warn = false;

	if (sysctl__read_int("kernel/nmi_watchdog", &n) < 0)
		return false;
	if (n > 0) {
		/* The enabled watchdog means grouping cannot be used. */
		*warn = true;
		return false;
	}
	return true;
}
|
||||
|
||||
/*
 * Tell the user on stderr that the NMI watchdog is enabled while topdown
 * is in use, and how to turn the watchdog off.
 */
void arch_topdown_group_warn(void)
{
	static const char msg[] =
		"nmi_watchdog enabled with topdown. May give wrong results.\n"
		"Disable with echo 0 > /proc/sys/kernel/nmi_watchdog\n";

	fputs(msg, stderr);
}
|
@ -339,7 +339,7 @@ static void set_print_ip_opts(struct perf_event_attr *attr)
|
||||
*/
|
||||
static int perf_session__check_output_opt(struct perf_session *session)
|
||||
{
|
||||
int j;
|
||||
unsigned int j;
|
||||
struct perf_evsel *evsel;
|
||||
|
||||
for (j = 0; j < PERF_TYPE_MAX; ++j) {
|
||||
@ -388,17 +388,20 @@ static int perf_session__check_output_opt(struct perf_session *session)
|
||||
struct perf_event_attr *attr;
|
||||
|
||||
j = PERF_TYPE_TRACEPOINT;
|
||||
evsel = perf_session__find_first_evtype(session, j);
|
||||
if (evsel == NULL)
|
||||
goto out;
|
||||
|
||||
attr = &evsel->attr;
|
||||
evlist__for_each(session->evlist, evsel) {
|
||||
if (evsel->attr.type != j)
|
||||
continue;
|
||||
|
||||
if (attr->sample_type & PERF_SAMPLE_CALLCHAIN) {
|
||||
output[j].fields |= PERF_OUTPUT_IP;
|
||||
output[j].fields |= PERF_OUTPUT_SYM;
|
||||
output[j].fields |= PERF_OUTPUT_DSO;
|
||||
set_print_ip_opts(attr);
|
||||
attr = &evsel->attr;
|
||||
|
||||
if (attr->sample_type & PERF_SAMPLE_CALLCHAIN) {
|
||||
output[j].fields |= PERF_OUTPUT_IP;
|
||||
output[j].fields |= PERF_OUTPUT_SYM;
|
||||
output[j].fields |= PERF_OUTPUT_DSO;
|
||||
set_print_ip_opts(attr);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -59,10 +59,13 @@
|
||||
#include "util/thread.h"
|
||||
#include "util/thread_map.h"
|
||||
#include "util/counts.h"
|
||||
#include "util/group.h"
|
||||
#include "util/session.h"
|
||||
#include "util/tool.h"
|
||||
#include "util/group.h"
|
||||
#include "asm/bug.h"
|
||||
|
||||
#include <api/fs/fs.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <locale.h>
|
||||
@ -98,6 +101,15 @@ static const char * transaction_limited_attrs = {
|
||||
"}"
|
||||
};
|
||||
|
||||
/*
 * Event names requested for --topdown, NULL terminated.
 * The pointer slots are deliberately not const: topdown_filter_events()
 * compacts this table in place.
 */
static const char *topdown_attrs[] = {
	"topdown-total-slots",
	"topdown-slots-retired",
	"topdown-recovery-bubbles",
	"topdown-fetch-bubbles",
	"topdown-slots-issued",
	NULL,
};
|
||||
|
||||
static struct perf_evlist *evsel_list;
|
||||
|
||||
static struct target target = {
|
||||
@ -112,6 +124,7 @@ static volatile pid_t child_pid = -1;
|
||||
static bool null_run = false;
|
||||
static int detailed_run = 0;
|
||||
static bool transaction_run;
|
||||
static bool topdown_run = false;
|
||||
static bool big_num = true;
|
||||
static int big_num_opt = -1;
|
||||
static const char *csv_sep = NULL;
|
||||
@ -124,6 +137,7 @@ static unsigned int initial_delay = 0;
|
||||
static unsigned int unit_width = 4; /* strlen("unit") */
|
||||
static bool forever = false;
|
||||
static bool metric_only = false;
|
||||
static bool force_metric_only = false;
|
||||
static struct timespec ref_time;
|
||||
static struct cpu_map *aggr_map;
|
||||
static aggr_get_id_t aggr_get_id;
|
||||
@ -1302,7 +1316,15 @@ static int aggr_header_lens[] = {
|
||||
[AGGR_GLOBAL] = 0,
|
||||
};
|
||||
|
||||
static void print_metric_headers(char *prefix)
|
||||
static const char *aggr_header_csv[] = {
|
||||
[AGGR_CORE] = "core,cpus,",
|
||||
[AGGR_SOCKET] = "socket,cpus",
|
||||
[AGGR_NONE] = "cpu,",
|
||||
[AGGR_THREAD] = "comm-pid,",
|
||||
[AGGR_GLOBAL] = ""
|
||||
};
|
||||
|
||||
static void print_metric_headers(const char *prefix, bool no_indent)
|
||||
{
|
||||
struct perf_stat_output_ctx out;
|
||||
struct perf_evsel *counter;
|
||||
@ -1313,9 +1335,15 @@ static void print_metric_headers(char *prefix)
|
||||
if (prefix)
|
||||
fprintf(stat_config.output, "%s", prefix);
|
||||
|
||||
if (!csv_output)
|
||||
if (!csv_output && !no_indent)
|
||||
fprintf(stat_config.output, "%*s",
|
||||
aggr_header_lens[stat_config.aggr_mode], "");
|
||||
if (csv_output) {
|
||||
if (stat_config.interval)
|
||||
fputs("time,", stat_config.output);
|
||||
fputs(aggr_header_csv[stat_config.aggr_mode],
|
||||
stat_config.output);
|
||||
}
|
||||
|
||||
/* Print metrics headers only */
|
||||
evlist__for_each(evsel_list, counter) {
|
||||
@ -1338,28 +1366,40 @@ static void print_interval(char *prefix, struct timespec *ts)
|
||||
|
||||
sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep);
|
||||
|
||||
if (num_print_interval == 0 && !csv_output && !metric_only) {
|
||||
if (num_print_interval == 0 && !csv_output) {
|
||||
switch (stat_config.aggr_mode) {
|
||||
case AGGR_SOCKET:
|
||||
fprintf(output, "# time socket cpus counts %*s events\n", unit_width, "unit");
|
||||
fprintf(output, "# time socket cpus");
|
||||
if (!metric_only)
|
||||
fprintf(output, " counts %*s events\n", unit_width, "unit");
|
||||
break;
|
||||
case AGGR_CORE:
|
||||
fprintf(output, "# time core cpus counts %*s events\n", unit_width, "unit");
|
||||
fprintf(output, "# time core cpus");
|
||||
if (!metric_only)
|
||||
fprintf(output, " counts %*s events\n", unit_width, "unit");
|
||||
break;
|
||||
case AGGR_NONE:
|
||||
fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit");
|
||||
fprintf(output, "# time CPU");
|
||||
if (!metric_only)
|
||||
fprintf(output, " counts %*s events\n", unit_width, "unit");
|
||||
break;
|
||||
case AGGR_THREAD:
|
||||
fprintf(output, "# time comm-pid counts %*s events\n", unit_width, "unit");
|
||||
fprintf(output, "# time comm-pid");
|
||||
if (!metric_only)
|
||||
fprintf(output, " counts %*s events\n", unit_width, "unit");
|
||||
break;
|
||||
case AGGR_GLOBAL:
|
||||
default:
|
||||
fprintf(output, "# time counts %*s events\n", unit_width, "unit");
|
||||
fprintf(output, "# time");
|
||||
if (!metric_only)
|
||||
fprintf(output, " counts %*s events\n", unit_width, "unit");
|
||||
case AGGR_UNSET:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (num_print_interval == 0 && metric_only)
|
||||
print_metric_headers(" ", true);
|
||||
if (++num_print_interval == 25)
|
||||
num_print_interval = 0;
|
||||
}
|
||||
@ -1428,8 +1468,8 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
|
||||
if (metric_only) {
|
||||
static int num_print_iv;
|
||||
|
||||
if (num_print_iv == 0)
|
||||
print_metric_headers(prefix);
|
||||
if (num_print_iv == 0 && !interval)
|
||||
print_metric_headers(prefix, false);
|
||||
if (num_print_iv++ == 25)
|
||||
num_print_iv = 0;
|
||||
if (stat_config.aggr_mode == AGGR_GLOBAL && prefix)
|
||||
@ -1520,6 +1560,14 @@ static int stat__set_big_num(const struct option *opt __maybe_unused,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Option callback for --metric-only / --no-metric-only.
 *
 * Stores the user's choice in metric_only and sets force_metric_only to
 * record that the option was given explicitly. Always returns 0.
 */
static int enable_metric_only(const struct option *opt __maybe_unused,
			      const char *s __maybe_unused, int unset)
{
	/* "unset" is non-zero for the negated (--no-metric-only) form. */
	metric_only = (unset == 0);
	force_metric_only = true;

	return 0;
}
|
||||
|
||||
static const struct option stat_options[] = {
|
||||
OPT_BOOLEAN('T', "transaction", &transaction_run,
|
||||
"hardware transaction statistics"),
|
||||
@ -1578,8 +1626,10 @@ static const struct option stat_options[] = {
|
||||
"aggregate counts per thread", AGGR_THREAD),
|
||||
OPT_UINTEGER('D', "delay", &initial_delay,
|
||||
"ms to wait before starting measurement after program start"),
|
||||
OPT_BOOLEAN(0, "metric-only", &metric_only,
|
||||
"Only print computed metrics. No raw values"),
|
||||
OPT_CALLBACK_NOOPT(0, "metric-only", &metric_only, NULL,
|
||||
"Only print computed metrics. No raw values", enable_metric_only),
|
||||
OPT_BOOLEAN(0, "topdown", &topdown_run,
|
||||
"measure topdown level 1 statistics"),
|
||||
OPT_END()
|
||||
};
|
||||
|
||||
@ -1772,12 +1822,62 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Reduce @attr to the events the "cpu" PMU actually has and build the
 * matching event string.
 *
 * @attr:      NULL-terminated table of event names; compacted in place so
 *             that only supported names remain (still NULL terminated).
 * @str:       receives a malloc()ed string with the surviving names joined
 *             by ',' - wrapped in '{' '}' when @use_group is set - or an
 *             empty string when nothing survived. The caller frees it.
 * @use_group: wrap the list in braces.
 *
 * Returns 0 on success, -1 if the allocation failed (*str is then NULL).
 */
static int topdown_filter_events(const char **attr, char **str, bool use_group)
{
	int nr_kept = 0;
	int total = 0;
	int idx;
	char *cursor;

	/* Pass 1: drop unsupported events, sizing the output as we go. */
	for (idx = 0; attr[idx] != NULL; idx++) {
		if (!pmu_have_event("cpu", attr[idx]))
			continue;
		/* name plus the ',' (or terminator) that follows it */
		total += strlen(attr[idx]) + 1;
		attr[nr_kept++] = attr[idx];
	}
	attr[nr_kept] = NULL;

	/* One byte for the terminator, two for the optional braces. */
	*str = malloc(total + 1 + 2);
	if (*str == NULL)
		return -1;

	cursor = *str;
	if (nr_kept == 0) {
		*cursor = 0;
		return 0;
	}

	/* Pass 2: emit "[{]a,b,c," and then patch up the trailing comma. */
	if (use_group)
		*cursor++ = '{';

	for (idx = 0; idx < nr_kept; idx++) {
		strcpy(cursor, attr[idx]);
		cursor += strlen(cursor);
		*cursor++ = ',';
	}

	if (!use_group) {
		cursor[-1] = 0;
		return 0;
	}

	cursor[-1] = '}';
	*cursor = 0;
	return 0;
}
|
||||
|
||||
/*
 * Default (weak) implementation, used when the architecture does not
 * provide its own: never allow grouping and never ask for a warning.
 */
__weak bool arch_topdown_check_group(bool *warn)
{
	bool can_group = false;

	*warn = false;

	return can_group;
}
|
||||
|
||||
/* Default (weak) implementation: there is nothing to warn about. */
__weak void arch_topdown_group_warn(void)
{
	return;
}
|
||||
|
||||
/*
|
||||
* Add default attributes, if there were no attributes specified or
|
||||
* if -d/--detailed, -d -d or -d -d -d is used:
|
||||
*/
|
||||
static int add_default_attributes(void)
|
||||
{
|
||||
int err;
|
||||
struct perf_event_attr default_attrs0[] = {
|
||||
|
||||
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
|
||||
@ -1896,7 +1996,6 @@ static int add_default_attributes(void)
|
||||
return 0;
|
||||
|
||||
if (transaction_run) {
|
||||
int err;
|
||||
if (pmu_have_event("cpu", "cycles-ct") &&
|
||||
pmu_have_event("cpu", "el-start"))
|
||||
err = parse_events(evsel_list, transaction_attrs, NULL);
|
||||
@ -1909,6 +2008,46 @@ static int add_default_attributes(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (topdown_run) {
|
||||
char *str = NULL;
|
||||
bool warn = false;
|
||||
|
||||
if (stat_config.aggr_mode != AGGR_GLOBAL &&
|
||||
stat_config.aggr_mode != AGGR_CORE) {
|
||||
pr_err("top down event configuration requires --per-core mode\n");
|
||||
return -1;
|
||||
}
|
||||
stat_config.aggr_mode = AGGR_CORE;
|
||||
if (nr_cgroups || !target__has_cpu(&target)) {
|
||||
pr_err("top down event configuration requires system-wide mode (-a)\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!force_metric_only)
|
||||
metric_only = true;
|
||||
if (topdown_filter_events(topdown_attrs, &str,
|
||||
arch_topdown_check_group(&warn)) < 0) {
|
||||
pr_err("Out of memory\n");
|
||||
return -1;
|
||||
}
|
||||
if (topdown_attrs[0] && str) {
|
||||
if (warn)
|
||||
arch_topdown_group_warn();
|
||||
err = parse_events(evsel_list, str, NULL);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Cannot set up top down events %s: %d\n",
|
||||
str, err);
|
||||
free(str);
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "System does not support topdown\n");
|
||||
return -1;
|
||||
}
|
||||
free(str);
|
||||
}
|
||||
|
||||
if (!evsel_list->nr_entries) {
|
||||
if (target__has_cpu(&target))
|
||||
default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK;
|
||||
|
@ -1783,8 +1783,8 @@ static int test_pmu_events(void)
|
||||
struct evlist_test e;
|
||||
char name[MAX_NAME];
|
||||
|
||||
if (!strcmp(ent->d_name, ".") ||
|
||||
!strcmp(ent->d_name, ".."))
|
||||
/* Names containing . are special and cannot be used directly */
|
||||
if (strchr(ent->d_name, '.'))
|
||||
continue;
|
||||
|
||||
snprintf(name, MAX_NAME, "cpu/event=%s/u", ent->d_name);
|
||||
|
@ -275,7 +275,8 @@ static int perf_parse_file(config_fn_t fn, void *data)
|
||||
break;
|
||||
}
|
||||
}
|
||||
die("bad config file line %d in %s", config_linenr, config_file_name);
|
||||
pr_err("bad config file line %d in %s\n", config_linenr, config_file_name);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int parse_unit_factor(const char *end, unsigned long *val)
|
||||
@ -479,16 +480,15 @@ static int perf_config_global(void)
|
||||
|
||||
int perf_config(config_fn_t fn, void *data)
|
||||
{
|
||||
int ret = 0, found = 0;
|
||||
int ret = -1;
|
||||
const char *home = NULL;
|
||||
|
||||
/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
|
||||
if (config_exclusive_filename)
|
||||
return perf_config_from_file(fn, config_exclusive_filename, data);
|
||||
if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
|
||||
ret += perf_config_from_file(fn, perf_etc_perfconfig(),
|
||||
data);
|
||||
found += 1;
|
||||
if (perf_config_from_file(fn, perf_etc_perfconfig(), data) < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
home = getenv("HOME");
|
||||
@ -514,14 +514,12 @@ int perf_config(config_fn_t fn, void *data)
|
||||
if (!st.st_size)
|
||||
goto out_free;
|
||||
|
||||
ret += perf_config_from_file(fn, user_config, data);
|
||||
found += 1;
|
||||
ret = perf_config_from_file(fn, user_config, data);
|
||||
|
||||
out_free:
|
||||
free(user_config);
|
||||
}
|
||||
out:
|
||||
if (found == 0)
|
||||
return -1;
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -609,8 +607,12 @@ static int collect_config(const char *var, const char *value,
|
||||
struct perf_config_section *section = NULL;
|
||||
struct perf_config_item *item = NULL;
|
||||
struct perf_config_set *set = perf_config_set;
|
||||
struct list_head *sections = &set->sections;
|
||||
struct list_head *sections;
|
||||
|
||||
if (set == NULL)
|
||||
return -1;
|
||||
|
||||
sections = &set->sections;
|
||||
key = ptr = strdup(var);
|
||||
if (!key) {
|
||||
pr_debug("%s: strdup failed\n", __func__);
|
||||
|
@ -946,9 +946,12 @@ static int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
|
||||
if (cpu_map__empty(evlist->cpus))
|
||||
evlist->nr_mmaps = thread_map__nr(evlist->threads);
|
||||
evlist->mmap = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap));
|
||||
if (!evlist->mmap)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < evlist->nr_mmaps; i++)
|
||||
evlist->mmap[i].fd = -1;
|
||||
return evlist->mmap != NULL ? 0 : -ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct mmap_params {
|
||||
|
@ -2251,17 +2251,11 @@ void *perf_evsel__rawptr(struct perf_evsel *evsel, struct perf_sample *sample,
|
||||
return sample->raw_data + offset;
|
||||
}
|
||||
|
||||
u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
|
||||
const char *name)
|
||||
u64 format_field__intval(struct format_field *field, struct perf_sample *sample,
|
||||
bool needs_swap)
|
||||
{
|
||||
struct format_field *field = perf_evsel__field(evsel, name);
|
||||
void *ptr;
|
||||
u64 value;
|
||||
|
||||
if (!field)
|
||||
return 0;
|
||||
|
||||
ptr = sample->raw_data + field->offset;
|
||||
void *ptr = sample->raw_data + field->offset;
|
||||
|
||||
switch (field->size) {
|
||||
case 1:
|
||||
@ -2279,7 +2273,7 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!evsel->needs_swap)
|
||||
if (!needs_swap)
|
||||
return value;
|
||||
|
||||
switch (field->size) {
|
||||
@ -2296,6 +2290,17 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
|
||||
return 0;
|
||||
}
|
||||
|
||||
u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
|
||||
const char *name)
|
||||
{
|
||||
struct format_field *field = perf_evsel__field(evsel, name);
|
||||
|
||||
if (!field)
|
||||
return 0;
|
||||
|
||||
return field ? format_field__intval(field, sample, evsel->needs_swap) : 0;
|
||||
}
|
||||
|
||||
bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
|
||||
char *msg, size_t msgsize)
|
||||
{
|
||||
|
@ -261,6 +261,8 @@ static inline char *perf_evsel__strval(struct perf_evsel *evsel,
|
||||
|
||||
struct format_field;
|
||||
|
||||
u64 format_field__intval(struct format_field *field, struct perf_sample *sample, bool needs_swap);
|
||||
|
||||
struct format_field *perf_evsel__field(struct perf_evsel *evsel, const char *name);
|
||||
|
||||
#define perf_evsel__match(evsel, t, c) \
|
||||
|
7
tools/perf/util/group.h
Normal file
7
tools/perf/util/group.h
Normal file
@ -0,0 +1,7 @@
|
||||
#ifndef GROUP_H
#define GROUP_H 1

/* The prototypes below use bool; include it here so the header is self-contained. */
#include <stdbool.h>

/*
 * Decide whether the topdown events may be put into one group.
 * @warn is set when the caller should emit arch_topdown_group_warn().
 */
bool arch_topdown_check_group(bool *warn);

/* Print the architecture-specific explanation of why grouping is not used. */
void arch_topdown_group_warn(void);

#endif
|
@ -260,6 +260,7 @@ cycles-ct { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
|
||||
cycles-t { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
|
||||
mem-loads { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
|
||||
mem-stores { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
|
||||
topdown-[a-z-]+ { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
|
||||
|
||||
L1-dcache|l1-d|l1d|L1-data |
|
||||
L1-icache|l1-i|l1i|L1-instruction |
|
||||
|
@ -36,6 +36,11 @@ static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
|
||||
static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
|
||||
static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
|
||||
static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
|
||||
static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
|
||||
static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
|
||||
static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
|
||||
static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
|
||||
static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
|
||||
static bool have_frontend_stalled;
|
||||
|
||||
struct stats walltime_nsecs_stats;
|
||||
@ -82,6 +87,11 @@ void perf_stat__reset_shadow_stats(void)
|
||||
sizeof(runtime_transaction_stats));
|
||||
memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
|
||||
memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
|
||||
memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
|
||||
memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
|
||||
memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
|
||||
memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
|
||||
memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -105,6 +115,16 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
|
||||
update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
|
||||
else if (perf_stat_evsel__is(counter, ELISION_START))
|
||||
update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
|
||||
else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
|
||||
update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]);
|
||||
else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
|
||||
update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]);
|
||||
else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
|
||||
update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]);
|
||||
else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
|
||||
update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu],count[0]);
|
||||
else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
|
||||
update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]);
|
||||
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
|
||||
update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
|
||||
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
|
||||
@ -302,6 +322,107 @@ static void print_ll_cache_misses(int cpu,
|
||||
out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
|
||||
}
|
||||
|
||||
/*
|
||||
* High level "TopDown" CPU core pipeline bottleneck breakdown.
|
||||
*
|
||||
* Basic concept following
|
||||
* Yasin, A Top Down Method for Performance analysis and Counter architecture
|
||||
* ISPASS14
|
||||
*
|
||||
* The CPU pipeline is divided into 4 areas that can be bottlenecks:
|
||||
*
|
||||
* Frontend -> Backend -> Retiring
|
||||
* BadSpeculation in addition means out of order execution that is thrown away
|
||||
* (for example branch mispredictions)
|
||||
* Frontend is instruction decoding.
|
||||
* Backend is execution, like computation and accessing data in memory
|
||||
* Retiring is good execution that is not directly bottlenecked
|
||||
*
|
||||
* The formulas are computed in slots.
|
||||
* A slot is an entry in the pipeline; each cycle has one slot per unit of pipeline width
|
||||
* (for example a 4-wide pipeline has 4 slots for each cycle)
|
||||
*
|
||||
* Formulas:
|
||||
* BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
|
||||
* TotalSlots
|
||||
* Retiring = SlotsRetired / TotalSlots
|
||||
* FrontendBound = FetchBubbles / TotalSlots
|
||||
* BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
|
||||
*
|
||||
* The kernel provides the mapping to the low level CPU events and any scaling
|
||||
* needed for the CPU pipeline width, for example:
|
||||
*
|
||||
* TotalSlots = Cycles * 4
|
||||
*
|
||||
* The scaling factor is communicated in the sysfs unit.
|
||||
*
|
||||
* In some cases the CPU may not be able to measure all the formulas due to
|
||||
* missing events. In this case multiple formulas are combined, as possible.
|
||||
*
|
||||
* Full TopDown supports more levels to sub-divide each area: for example
|
||||
* BackendBound into computing bound and memory bound. For now we only
|
||||
* support Level 1 TopDown.
|
||||
*/
|
||||
|
||||
/*
 * Map values in [-0.02, 0) to 0.0 and return everything else unchanged.
 * NOTE(review): presumably this hides small negative ratios caused by
 * measurement noise - confirm.
 */
static double sanitize_val(double x)
{
	const double lowest_clamped = -0.02;

	return (x >= lowest_clamped && x < 0) ? 0.0 : x;
}
|
||||
|
||||
static double td_total_slots(int ctx, int cpu)
|
||||
{
|
||||
return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
|
||||
}
|
||||
|
||||
static double td_bad_spec(int ctx, int cpu)
|
||||
{
|
||||
double bad_spec = 0;
|
||||
double total_slots;
|
||||
double total;
|
||||
|
||||
total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
|
||||
avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
|
||||
avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
|
||||
total_slots = td_total_slots(ctx, cpu);
|
||||
if (total_slots)
|
||||
bad_spec = total / total_slots;
|
||||
return sanitize_val(bad_spec);
|
||||
}
|
||||
|
||||
static double td_retiring(int ctx, int cpu)
|
||||
{
|
||||
double retiring = 0;
|
||||
double total_slots = td_total_slots(ctx, cpu);
|
||||
double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);
|
||||
|
||||
if (total_slots)
|
||||
retiring = ret_slots / total_slots;
|
||||
return retiring;
|
||||
}
|
||||
|
||||
static double td_fe_bound(int ctx, int cpu)
|
||||
{
|
||||
double fe_bound = 0;
|
||||
double total_slots = td_total_slots(ctx, cpu);
|
||||
double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);
|
||||
|
||||
if (total_slots)
|
||||
fe_bound = fetch_bub / total_slots;
|
||||
return fe_bound;
|
||||
}
|
||||
|
||||
/*
 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
 *
 * Yields 0 when the three other fractions add up to exactly zero;
 * otherwise the result is passed through sanitize_val().
 */
static double td_be_bound(int ctx, int cpu)
{
	double frontend = td_fe_bound(ctx, cpu);
	double bad_spec = td_bad_spec(ctx, cpu);
	double retiring = td_retiring(ctx, cpu);
	double accounted = frontend + bad_spec + retiring;

	return accounted == 0 ? 0 : sanitize_val(1.0 - accounted);
}
|
||||
|
||||
void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
|
||||
double avg, int cpu,
|
||||
struct perf_stat_output_ctx *out)
|
||||
@ -309,6 +430,7 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
|
||||
void *ctxp = out->ctx;
|
||||
print_metric_t print_metric = out->print_metric;
|
||||
double total, ratio = 0.0, total2;
|
||||
const char *color = NULL;
|
||||
int ctx = evsel_context(evsel);
|
||||
|
||||
if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
|
||||
@ -452,6 +574,46 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
|
||||
avg / ratio);
|
||||
else
|
||||
print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
|
||||
} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
|
||||
double fe_bound = td_fe_bound(ctx, cpu);
|
||||
|
||||
if (fe_bound > 0.2)
|
||||
color = PERF_COLOR_RED;
|
||||
print_metric(ctxp, color, "%8.1f%%", "frontend bound",
|
||||
fe_bound * 100.);
|
||||
} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
|
||||
double retiring = td_retiring(ctx, cpu);
|
||||
|
||||
if (retiring > 0.7)
|
||||
color = PERF_COLOR_GREEN;
|
||||
print_metric(ctxp, color, "%8.1f%%", "retiring",
|
||||
retiring * 100.);
|
||||
} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
|
||||
double bad_spec = td_bad_spec(ctx, cpu);
|
||||
|
||||
if (bad_spec > 0.1)
|
||||
color = PERF_COLOR_RED;
|
||||
print_metric(ctxp, color, "%8.1f%%", "bad speculation",
|
||||
bad_spec * 100.);
|
||||
} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
|
||||
double be_bound = td_be_bound(ctx, cpu);
|
||||
const char *name = "backend bound";
|
||||
static int have_recovery_bubbles = -1;
|
||||
|
||||
/* In case the CPU does not support topdown-recovery-bubbles */
|
||||
if (have_recovery_bubbles < 0)
|
||||
have_recovery_bubbles = pmu_have_event("cpu",
|
||||
"topdown-recovery-bubbles");
|
||||
if (!have_recovery_bubbles)
|
||||
name = "backend bound/bad spec";
|
||||
|
||||
if (be_bound > 0.2)
|
||||
color = PERF_COLOR_RED;
|
||||
if (td_total_slots(ctx, cpu) > 0)
|
||||
print_metric(ctxp, color, "%8.1f%%", name,
|
||||
be_bound * 100.);
|
||||
else
|
||||
print_metric(ctxp, NULL, NULL, name, 0);
|
||||
} else if (runtime_nsecs_stats[cpu].n != 0) {
|
||||
char unit = 'M';
|
||||
char unit_buf[10];
|
||||
|
@ -79,6 +79,11 @@ static const char *id_str[PERF_STAT_EVSEL_ID__MAX] = {
|
||||
ID(TRANSACTION_START, cpu/tx-start/),
|
||||
ID(ELISION_START, cpu/el-start/),
|
||||
ID(CYCLES_IN_TX_CP, cpu/cycles-ct/),
|
||||
ID(TOPDOWN_TOTAL_SLOTS, topdown-total-slots),
|
||||
ID(TOPDOWN_SLOTS_ISSUED, topdown-slots-issued),
|
||||
ID(TOPDOWN_SLOTS_RETIRED, topdown-slots-retired),
|
||||
ID(TOPDOWN_FETCH_BUBBLES, topdown-fetch-bubbles),
|
||||
ID(TOPDOWN_RECOVERY_BUBBLES, topdown-recovery-bubbles),
|
||||
};
|
||||
#undef ID
|
||||
|
||||
|
@ -17,6 +17,11 @@ enum perf_stat_evsel_id {
|
||||
PERF_STAT_EVSEL_ID__TRANSACTION_START,
|
||||
PERF_STAT_EVSEL_ID__ELISION_START,
|
||||
PERF_STAT_EVSEL_ID__CYCLES_IN_TX_CP,
|
||||
PERF_STAT_EVSEL_ID__TOPDOWN_TOTAL_SLOTS,
|
||||
PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_ISSUED,
|
||||
PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_RETIRED,
|
||||
PERF_STAT_EVSEL_ID__TOPDOWN_FETCH_BUBBLES,
|
||||
PERF_STAT_EVSEL_ID__TOPDOWN_RECOVERY_BUBBLES,
|
||||
PERF_STAT_EVSEL_ID__MAX,
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user