Merge tag 'perf-core-for-mingo-2' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core

Pull perf/core improvements from Arnaldo Carvalho de Melo:

User visible changes:

  - Support handling complete branch stacks as histograms (Andi Kleen)

Infrastructure changes:

  - Prep work for supporting per-pkg and snapshot counters in 'perf stat' (Jiri Olsa)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit cfa0bd52d0
tools/perf/Documentation/perf-report.txt:

@@ -159,7 +159,7 @@ OPTIONS
 --dump-raw-trace::
         Dump raw trace in ASCII.
 
--g [type,min[,limit],order[,key]]::
+-g [type,min[,limit],order[,key][,branch]]::
 --call-graph::
         Display call chains using type, min percent threshold, optional print
         limit and order.
@@ -177,6 +177,11 @@ OPTIONS
     - function: compare on functions
     - address: compare on individual code addresses
 
+    branch can be:
+    - branch: include last branch information in callgraph
+      when available. Usually more convenient to use --branch-history
+      for this.
+
     Default: fractal,0.5,callee,function.
 
 --children::
@@ -266,6 +271,11 @@ OPTIONS
     branch stacks and it will automatically switch to the branch view mode,
     unless --no-branch-stack is used.
 
+--branch-history::
+    Add the addresses of sampled taken branches to the callstack.
+    This allows to examine the path the program took to each sample.
+    The data collection must have used -b (or -j) and -g.
+
 --objdump=<path>::
         Path to objdump binary.
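For orientation, the documentation above implies a two-step workflow: the recording must capture both branch stacks and call graphs before the new report views can be used. A plausible session might look like this (illustrative only, not part of the patch; the workload name is invented):

    # record with branch stacks (-b) and call graphs (-g)
    perf record -b -g ./myworkload

    # fold the sampled branches into the callstack view
    perf report --branch-history

    # or request branches through the extended call-graph syntax
    perf report -g graph,0.5,caller,address,branch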
tools/perf/builtin-report.c:

@@ -226,8 +226,9 @@ static int report__setup_sample_type(struct report *rep)
             return -EINVAL;
         }
         if (symbol_conf.use_callchain) {
-            ui__error("Selected -g but no callchain data. Did "
-                  "you call 'perf record' without -g?\n");
+            ui__error("Selected -g or --branch-history but no "
+                  "callchain data. Did\n"
+                  "you call 'perf record' without -g?\n");
             return -1;
         }
     } else if (!rep->dont_use_callchains &&
@@ -575,6 +576,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
     struct stat st;
     bool has_br_stack = false;
     int branch_mode = -1;
+    bool branch_call_mode = false;
     char callchain_default_opt[] = "fractal,0.5,callee";
     const char * const report_usage[] = {
         "perf report [<options>]",
@@ -637,8 +639,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
            "regex filter to identify parent, see: '--sort parent'"),
     OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
             "Only display entries with parent-match"),
-    OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
-             "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). "
+    OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order[,branch]",
+             "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address), add branches. "
              "Default: fractal,0.5,callee,function", &report_parse_callchain_opt, callchain_default_opt),
     OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain,
             "Accumulate callchains of children and show total overhead as well"),
@@ -684,7 +686,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
     OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
             "Show event group information together"),
     OPT_CALLBACK_NOOPT('b', "branch-stack", &branch_mode, "",
-            "use branch records for histogram filling", parse_branch_mode),
+            "use branch records for per branch histogram filling",
+            parse_branch_mode),
+    OPT_BOOLEAN(0, "branch-history", &branch_call_mode,
+            "add last branch records to call history"),
     OPT_STRING(0, "objdump", &objdump_path, "path",
            "objdump binary to use for disassembly and annotations"),
     OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
@@ -745,10 +750,24 @@ repeat:
     has_br_stack = perf_header__has_feat(&session->header,
                          HEADER_BRANCH_STACK);
 
-    if ((branch_mode == -1 && has_br_stack) || branch_mode == 1) {
+    /*
+     * Branch mode is a tristate:
+     * -1 means default, so decide based on the file having branch data.
+     * 0/1 means the user chose a mode.
+     */
+    if (((branch_mode == -1 && has_br_stack) || branch_mode == 1) &&
+        branch_call_mode == -1) {
         sort__mode = SORT_MODE__BRANCH;
+        symbol_conf.cumulate_callchain = false;
+    }
+    if (branch_call_mode) {
+        callchain_param.key = CCKEY_ADDRESS;
+        callchain_param.branch_callstack = 1;
+        symbol_conf.use_callchain = true;
+        callchain_register_param(&callchain_param);
+        if (sort_order == NULL)
+            sort_order = "srcline,symbol,dso";
     }
 
     if (report.mem_mode) {
         if (sort__mode == SORT_MODE__BRANCH) {
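The comment added above calls branch_mode a tristate: -1 leaves the decision to the data file, while 0 and 1 record an explicit user choice. A minimal standalone sketch of that pattern (illustrative only; the names are invented, not perf's):

    #include <stdbool.h>
    #include <stdio.h>

    /* -1 = unset (follow the data), 0 = user disabled, 1 = user enabled */
    static bool use_branch_view(int branch_mode, bool has_br_stack)
    {
            /* default: enable the branch view only if the file has branch data */
            if (branch_mode == -1)
                    return has_br_stack;
            return branch_mode == 1;
    }

    int main(void)
    {
            printf("%d\n", use_branch_view(-1, true)); /* 1: default + data present */
            printf("%d\n", use_branch_view(0, true));  /* 0: explicitly disabled */
            return 0;
    }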
tools/perf/builtin-stat.c:

@@ -388,20 +388,102 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
         update_stats(&runtime_itlb_cache_stats[0], count[0]);
 }
 
+static void zero_per_pkg(struct perf_evsel *counter)
+{
+    if (counter->per_pkg_mask)
+        memset(counter->per_pkg_mask, 0, MAX_NR_CPUS);
+}
+
+static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip)
+{
+    unsigned long *mask = counter->per_pkg_mask;
+    struct cpu_map *cpus = perf_evsel__cpus(counter);
+    int s;
+
+    *skip = false;
+
+    if (!counter->per_pkg)
+        return 0;
+
+    if (cpu_map__empty(cpus))
+        return 0;
+
+    if (!mask) {
+        mask = zalloc(MAX_NR_CPUS);
+        if (!mask)
+            return -ENOMEM;
+
+        counter->per_pkg_mask = mask;
+    }
+
+    s = cpu_map__get_socket(cpus, cpu);
+    if (s < 0)
+        return -1;
+
+    *skip = test_and_set_bit(s, mask) == 1;
+    return 0;
+}
+
+static int read_cb(struct perf_evsel *evsel, int cpu, int thread __maybe_unused,
+           struct perf_counts_values *count)
+{
+    struct perf_counts_values *aggr = &evsel->counts->aggr;
+    static struct perf_counts_values zero;
+    bool skip = false;
+
+    if (check_per_pkg(evsel, cpu, &skip)) {
+        pr_err("failed to read per-pkg counter\n");
+        return -1;
+    }
+
+    if (skip)
+        count = &zero;
+
+    switch (aggr_mode) {
+    case AGGR_CORE:
+    case AGGR_SOCKET:
+    case AGGR_NONE:
+        if (!evsel->snapshot)
+            perf_evsel__compute_deltas(evsel, cpu, count);
+        perf_counts_values__scale(count, scale, NULL);
+        evsel->counts->cpu[cpu] = *count;
+        update_shadow_stats(evsel, count->values);
+        break;
+    case AGGR_GLOBAL:
+        aggr->val += count->val;
+        if (scale) {
+            aggr->ena += count->ena;
+            aggr->run += count->run;
+        }
+    default:
+        break;
+    }
+
+    return 0;
+}
+
+static int read_counter(struct perf_evsel *counter);
+
 /*
  * Read out the results of a single counter:
  * aggregate counts across CPUs in system-wide mode
  */
 static int read_counter_aggr(struct perf_evsel *counter)
 {
     struct perf_counts_values *aggr = &counter->counts->aggr;
     struct perf_stat *ps = counter->priv;
     u64 *count = counter->counts->aggr.values;
     int i;
 
-    if (__perf_evsel__read(counter, perf_evsel__nr_cpus(counter),
-                   thread_map__nr(evsel_list->threads), scale) < 0)
+    aggr->val = aggr->ena = aggr->run = 0;
+
+    if (read_counter(counter))
         return -1;
 
+    if (!counter->snapshot)
+        perf_evsel__compute_deltas(counter, -1, aggr);
+    perf_counts_values__scale(aggr, scale, &counter->counts->scaled);
+
     for (i = 0; i < 3; i++)
         update_stats(&ps->res_stats[i], count[i]);
@@ -424,16 +506,21 @@ static int read_counter_aggr(struct perf_evsel *counter)
  */
 static int read_counter(struct perf_evsel *counter)
 {
-    u64 *count;
-    int cpu;
+    int nthreads = thread_map__nr(evsel_list->threads);
+    int ncpus = perf_evsel__nr_cpus(counter);
+    int cpu, thread;
 
-    for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
-        if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
-            return -1;
+    if (counter->system_wide)
+        nthreads = 1;
 
-        count = counter->counts->cpu[cpu].values;
+    if (counter->per_pkg)
+        zero_per_pkg(counter);
 
-        update_shadow_stats(counter, count);
+    for (thread = 0; thread < nthreads; thread++) {
+        for (cpu = 0; cpu < ncpus; cpu++) {
+            if (perf_evsel__read_cb(counter, cpu, thread, read_cb))
+                return -1;
+        }
     }
 
     return 0;
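check_per_pkg() above reads a per-package counter exactly once per socket: the socket id becomes a bit in a lazily allocated mask, the first CPU of each socket wins, and later CPUs on the same socket are skipped. A self-contained sketch of the same idea (simplified: a plain byte array stands in for test_and_set_bit(), and the CPU-to-socket topology is a made-up hard-coded table):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_SOCKETS 64

    static unsigned char seen[MAX_SOCKETS];

    /* returns true if this socket was already counted, i.e. skip the reading */
    static bool test_and_set_socket(int socket)
    {
            bool old = seen[socket];
            seen[socket] = 1;
            return old;
    }

    int main(void)
    {
            int cpu_to_socket[] = { 0, 0, 1, 1 }; /* hypothetical 2-socket box */

            for (int cpu = 0; cpu < 4; cpu++) {
                    int s = cpu_to_socket[cpu];

                    if (test_and_set_socket(s))
                            continue; /* package already counted once */
                    printf("read per-pkg counter on cpu %d (socket %d)\n", cpu, s);
            }
            return 0;
    }

Only CPUs 0 and 2 print; CPUs 1 and 3 are skipped because their package was already accounted for.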
tools/perf/util/callchain.c:

@@ -149,6 +149,10 @@ static int parse_callchain_sort_key(const char *value)
         callchain_param.key = CCKEY_ADDRESS;
         return 0;
     }
+    if (!strncmp(value, "branch", strlen(value))) {
+        callchain_param.branch_callstack = 1;
+        return 0;
+    }
     return -1;
 }
 
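Note the matching style: strncmp(value, "branch", strlen(value)) compares only as many characters as the user typed, so any prefix of "branch" is accepted. A tiny standalone demonstration of that behavior:

    #include <stdio.h>
    #include <string.h>

    /* accepts "b", "br", ..., "branch": every prefix of the keyword */
    static int matches_branch(const char *value)
    {
            return !strncmp(value, "branch", strlen(value));
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   matches_branch("br"),      /* 1: prefix */
                   matches_branch("branch"),  /* 1: exact match */
                   matches_branch("brunch")); /* 0: diverges at 'u' */
            return 0;
    }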
tools/perf/util/callchain.h:

@@ -63,6 +63,7 @@ struct callchain_param {
     sort_chain_func_t    sort;
     enum chain_order     order;
     enum chain_key       key;
+    bool                 branch_callstack;
 };
 
 extern struct callchain_param callchain_param;
tools/perf/util/evsel.c:

@@ -954,40 +954,6 @@ int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
     return 0;
 }
 
-int __perf_evsel__read(struct perf_evsel *evsel,
-               int ncpus, int nthreads, bool scale)
-{
-    size_t nv = scale ? 3 : 1;
-    int cpu, thread;
-    struct perf_counts_values *aggr = &evsel->counts->aggr, count;
-
-    if (evsel->system_wide)
-        nthreads = 1;
-
-    aggr->val = aggr->ena = aggr->run = 0;
-
-    for (cpu = 0; cpu < ncpus; cpu++) {
-        for (thread = 0; thread < nthreads; thread++) {
-            if (FD(evsel, cpu, thread) < 0)
-                continue;
-
-            if (readn(FD(evsel, cpu, thread),
-                  &count, nv * sizeof(u64)) < 0)
-                return -errno;
-
-            aggr->val += count.val;
-            if (scale) {
-                aggr->ena += count.ena;
-                aggr->run += count.run;
-            }
-        }
-    }
-
-    perf_evsel__compute_deltas(evsel, -1, aggr);
-    perf_counts_values__scale(aggr, scale, &evsel->counts->scaled);
-    return 0;
-}
-
 static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread)
 {
     struct perf_evsel *leader = evsel->leader;
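The block removed above did the per-CPU/per-thread iteration and the aggregation policy in one routine. After this series, the iteration lives in the tool (read_counter() walking perf_evsel__read_cb()) while the policy lives in a callback (read_cb()). A minimal sketch of that callback inversion, with invented names and a fake counter read in place of the real perf syscall plumbing:

    #include <stdio.h>

    struct counts { unsigned long long val; };

    /* the per-cell policy is supplied by the caller */
    typedef int (*read_cb_t)(int cpu, int thread, struct counts *count);

    /* generic iteration: visit every (cpu, thread) cell, defer to the callback */
    static int for_each_count(int ncpus, int nthreads, read_cb_t cb)
    {
            for (int thread = 0; thread < nthreads; thread++)
                    for (int cpu = 0; cpu < ncpus; cpu++) {
                            /* fake read: pretend each CPU counted (cpu+1)*100 */
                            struct counts c = { (unsigned long long)(cpu + 1) * 100 };

                            if (cb(cpu, thread, &c))
                                    return -1;
                    }
            return 0;
    }

    static unsigned long long total;

    /* an AGGR_GLOBAL-style policy: sum everything into one number */
    static int sum_cb(int cpu, int thread, struct counts *count)
    {
            (void)cpu; (void)thread;
            total += count->val;
            return 0;
    }

    int main(void)
    {
            if (!for_each_count(2, 2, sum_cb))
                    printf("total = %llu\n", total); /* 600 */
            return 0;
    }

Swapping sum_cb for a per-socket or per-core policy changes the aggregation without touching the iteration, which is the point of the prep work.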
tools/perf/util/evsel.h:

@@ -93,6 +93,7 @@ struct perf_evsel {
     bool            system_wide;
     bool            tracking;
     bool            per_pkg;
+    unsigned long   *per_pkg_mask;
     /* parse modifier helper */
     int             exclude_GH;
     int             nr_members;
@@ -271,35 +272,6 @@ static inline int perf_evsel__read_on_cpu_scaled(struct perf_evsel *evsel,
     return __perf_evsel__read_on_cpu(evsel, cpu, thread, true);
 }
 
-int __perf_evsel__read(struct perf_evsel *evsel, int ncpus, int nthreads,
-               bool scale);
-
-/**
- * perf_evsel__read - Read the aggregate results on all CPUs
- *
- * @evsel - event selector to read value
- * @ncpus - Number of cpus affected, from zero
- * @nthreads - Number of threads affected, from zero
- */
-static inline int perf_evsel__read(struct perf_evsel *evsel,
-                    int ncpus, int nthreads)
-{
-    return __perf_evsel__read(evsel, ncpus, nthreads, false);
-}
-
-/**
- * perf_evsel__read_scaled - Read the aggregate results on all CPUs, scaled
- *
- * @evsel - event selector to read value
- * @ncpus - Number of cpus affected, from zero
- * @nthreads - Number of threads affected, from zero
- */
-static inline int perf_evsel__read_scaled(struct perf_evsel *evsel,
-                      int ncpus, int nthreads)
-{
-    return __perf_evsel__read(evsel, ncpus, nthreads, true);
-}
-
 int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
                  struct perf_sample *sample);
tools/perf/util/machine.c:

@@ -12,6 +12,7 @@
 #include <stdbool.h>
 #include <symbol/kallsyms.h>
 #include "unwind.h"
+#include "linux/hash.h"
 
 static void dsos__init(struct dsos *dsos)
 {
@@ -1391,7 +1392,11 @@ static int add_callchain_ip(struct thread *thread,
 
     al.filtered = 0;
     al.sym = NULL;
-    thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
-                   ip, &al);
+    if (cpumode == -1)
+        thread__find_cpumode_addr_location(thread, MAP__FUNCTION,
+                           ip, &al);
+    else
+        thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
+                       ip, &al);
     if (al.sym != NULL) {
         if (sort__has_parent && !*parent &&
@@ -1427,8 +1432,50 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
     return bi;
 }
 
+#define CHASHSZ 127
+#define CHASHBITS 7
+#define NO_ENTRY 0xff
+
+#define PERF_MAX_BRANCH_DEPTH 127
+
+/* Remove loops. */
+static int remove_loops(struct branch_entry *l, int nr)
+{
+    int i, j, off;
+    unsigned char chash[CHASHSZ];
+
+    memset(chash, NO_ENTRY, sizeof(chash));
+
+    BUG_ON(PERF_MAX_BRANCH_DEPTH > 255);
+
+    for (i = 0; i < nr; i++) {
+        int h = hash_64(l[i].from, CHASHBITS) % CHASHSZ;
+
+        /* no collision handling for now */
+        if (chash[h] == NO_ENTRY) {
+            chash[h] = i;
+        } else if (l[chash[h]].from == l[i].from) {
+            bool is_loop = true;
+            /* check if it is a real loop */
+            off = 0;
+            for (j = chash[h]; j < i && i + off < nr; j++, off++)
+                if (l[j].from != l[i + off].from) {
+                    is_loop = false;
+                    break;
+                }
+            if (is_loop) {
+                memmove(l + i, l + i + off,
+                    (nr - (i + off)) * sizeof(*l));
+                nr -= off;
+            }
+        }
+    }
+    return nr;
+}
+
 static int thread__resolve_callchain_sample(struct thread *thread,
                         struct ip_callchain *chain,
+                        struct branch_stack *branch,
                         struct symbol **parent,
                         struct addr_location *root_al,
                         int max_stack)
@@ -1438,22 +1485,82 @@ static int thread__resolve_callchain_sample(struct thread *thread,
     int i;
     int j;
     int err;
-    int skip_idx __maybe_unused;
-
-    callchain_cursor_reset(&callchain_cursor);
-
-    if (chain->nr > PERF_MAX_STACK_DEPTH) {
-        pr_warning("corrupted callchain. skipping...\n");
-        return 0;
-    }
-
+    int skip_idx = -1;
+    int first_call = 0;
+
     /*
      * Based on DWARF debug information, some architectures skip
      * a callchain entry saved by the kernel.
      */
-    skip_idx = arch_skip_callchain_idx(thread, chain);
-
-    for (i = 0; i < chain_nr; i++) {
+    if (chain->nr < PERF_MAX_STACK_DEPTH)
+        skip_idx = arch_skip_callchain_idx(thread, chain);
+
+    callchain_cursor_reset(&callchain_cursor);
+
+    /*
+     * Add branches to call stack for easier browsing. This gives
+     * more context for a sample than just the callers.
+     *
+     * This uses individual histograms of paths compared to the
+     * aggregated histograms the normal LBR mode uses.
+     *
+     * Limitations for now:
+     * - No extra filters
+     * - No annotations (should annotate somehow)
+     */
+
+    if (branch && callchain_param.branch_callstack) {
+        int nr = min(max_stack, (int)branch->nr);
+        struct branch_entry be[nr];
+
+        if (branch->nr > PERF_MAX_BRANCH_DEPTH) {
+            pr_warning("corrupted branch chain. skipping...\n");
+            goto check_calls;
+        }
+
+        for (i = 0; i < nr; i++) {
+            if (callchain_param.order == ORDER_CALLEE) {
+                be[i] = branch->entries[i];
+                /*
+                 * Check for overlap into the callchain.
+                 * The return address is one off compared to
+                 * the branch entry. To adjust for this
+                 * assume the calling instruction is not longer
+                 * than 8 bytes.
+                 */
+                if (i == skip_idx ||
+                    chain->ips[first_call] >= PERF_CONTEXT_MAX)
+                    first_call++;
+                else if (be[i].from < chain->ips[first_call] &&
+                     be[i].from >= chain->ips[first_call] - 8)
+                    first_call++;
+            } else
+                be[i] = branch->entries[branch->nr - i - 1];
+        }
+
+        nr = remove_loops(be, nr);
+
+        for (i = 0; i < nr; i++) {
+            err = add_callchain_ip(thread, parent, root_al,
+                           -1, be[i].to);
+            if (!err)
+                err = add_callchain_ip(thread, parent, root_al,
+                               -1, be[i].from);
+            if (err == -EINVAL)
+                break;
+            if (err)
+                return err;
+        }
+        chain_nr -= nr;
+    }
+
+check_calls:
+    if (chain->nr > PERF_MAX_STACK_DEPTH) {
+        pr_warning("corrupted callchain. skipping...\n");
+        return 0;
+    }
+
+    for (i = first_call; i < chain_nr; i++) {
         u64 ip;
 
         if (callchain_param.order == ORDER_CALLEE)
@@ -1517,6 +1624,7 @@ int thread__resolve_callchain(struct thread *thread,
                   int max_stack)
 {
     int ret = thread__resolve_callchain_sample(thread, sample->callchain,
+                           sample->branch_stack,
                            parent, root_al, max_stack);
     if (ret)
         return ret;
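remove_loops() above is the algorithmic core of this hunk: it hashes each branch source address and, when the same source reappears, checks whether the span in between repeats verbatim; if so, the extra iteration is dropped, so a hot loop occupies one pass per unique path instead of filling the whole LBR window with copies. A standalone adaptation over plain addresses (modulo hashing stands in for hash_64(), and like the original there is no collision handling):

    #include <stdio.h>
    #include <string.h>

    #define CHASHSZ 127
    #define NO_ENTRY 0xff

    static int remove_loops(unsigned long *from, int nr)
    {
            int i, j, off;
            unsigned char chash[CHASHSZ];

            memset(chash, NO_ENTRY, sizeof(chash));

            for (i = 0; i < nr; i++) {
                    int h = from[i] % CHASHSZ; /* stand-in for hash_64() */

                    if (chash[h] == NO_ENTRY) {
                            chash[h] = i;      /* first sighting of this source */
                    } else if (from[chash[h]] == from[i]) {
                            int is_loop = 1;

                            /* verify the span between sightings really repeats */
                            off = 0;
                            for (j = chash[h]; j < i && i + off < nr; j++, off++)
                                    if (from[j] != from[i + off]) {
                                            is_loop = 0;
                                            break;
                                    }
                            if (is_loop) { /* drop the repeated iteration */
                                    memmove(from + i, from + i + off,
                                            (nr - (i + off)) * sizeof(*from));
                                    nr -= off;
                            }
                    }
            }
            return nr;
    }

    int main(void)
    {
            /* two iterations of the loop 10->20->30, then an exit branch at 40 */
            unsigned long trace[] = { 10, 20, 30, 10, 20, 30, 40 };
            int nr = remove_loops(trace, 7);

            for (int i = 0; i < nr; i++)
                    printf("%lu ", trace[i]);
            printf("\n"); /* prints: 10 20 30 40 */
            return 0;
    }

Running it prints 10 20 30 40: the second pass through the loop body is removed while the loop exit is kept.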
tools/perf/util/symbol.h:

@@ -102,7 +102,8 @@ struct symbol_conf {
             demangle,
             demangle_kernel,
             filter_relative,
-            show_hist_headers;
+            show_hist_headers,
+            branch_callstack;
     const char    *vmlinux_name,
             *kallsyms_name,
             *source_prefix,