Merge tag 'perf-core-for-mingo-2' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core

Pull perf/core improvements from Arnaldo Carvalho de Melo:

User visible changes:

  - Support handling complete branch stacks as histograms (Andi Kleen)

Infrastructure changes:

  - Prep work for supporting per-pkg and snapshot counters in 'perf stat' (Jiri Olsa)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Committer: Ingo Molnar <mingo@kernel.org>, 2014-12-08 07:45:45 +01:00
Commit: cfa0bd52d0
9 changed files with 259 additions and 91 deletions

tools/perf/Documentation/perf-report.txt

@@ -159,7 +159,7 @@ OPTIONS
--dump-raw-trace::
Dump raw trace in ASCII.
-g [type,min[,limit],order[,key]]::
-g [type,min[,limit],order[,key][,branch]]::
--call-graph::
Display call chains using type, min percent threshold, optional print
limit and order.
@@ -177,6 +177,11 @@ OPTIONS
- function: compare on functions
- address: compare on individual code addresses
branch can be:
- branch: include last branch information in callgraph
when available. Usually more convenient to use --branch-history
for this.
Default: fractal,0.5,callee,function.
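For example, a hypothetical invocation that enables the new key:

	perf report -g graph,0.5,caller,function,branch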
--children::
@@ -266,6 +271,11 @@ OPTIONS
branch stacks and it will automatically switch to the branch view mode,
unless --no-branch-stack is used.
--branch-history::
Add the addresses of sampled taken branches to the callstack.
This allows examining the path the program took to each sample.
The data collection must have used -b (or -j) and -g.
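For example, a minimal session (hypothetical workload name; flags as
described above):

	perf record -b -g ./workload
	perf report --branch-history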
--objdump=<path>::
Path to objdump binary.

tools/perf/builtin-report.c

@@ -226,8 +226,9 @@ static int report__setup_sample_type(struct report *rep)
return -EINVAL;
}
if (symbol_conf.use_callchain) {
ui__error("Selected -g but no callchain data. Did "
"you call 'perf record' without -g?\n");
ui__error("Selected -g or --branch-history but no "
"callchain data. Did\n"
"you call 'perf record' without -g?\n");
return -1;
}
} else if (!rep->dont_use_callchains &&
@@ -575,6 +576,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
struct stat st;
bool has_br_stack = false;
int branch_mode = -1;
bool branch_call_mode = false;
char callchain_default_opt[] = "fractal,0.5,callee";
const char * const report_usage[] = {
"perf report [<options>]",
@@ -637,8 +639,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
"regex filter to identify parent, see: '--sort parent'"),
OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
"Only display entries with parent-match"),
OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
"Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). "
OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order[,branch]",
"Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address), add branches. "
"Default: fractal,0.5,callee,function", &report_parse_callchain_opt, callchain_default_opt),
OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain,
"Accumulate callchains of children and show total overhead as well"),
@@ -684,7 +686,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
"Show event group information together"),
OPT_CALLBACK_NOOPT('b', "branch-stack", &branch_mode, "",
"use branch records for histogram filling", parse_branch_mode),
"use branch records for per branch histogram filling",
parse_branch_mode),
OPT_BOOLEAN(0, "branch-history", &branch_call_mode,
"add last branch records to call history"),
OPT_STRING(0, "objdump", &objdump_path, "path",
"objdump binary to use for disassembly and annotations"),
OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
@@ -745,10 +750,24 @@ repeat:
has_br_stack = perf_header__has_feat(&session->header,
HEADER_BRANCH_STACK);
if ((branch_mode == -1 && has_br_stack) || branch_mode == 1) {
/*
* Branch mode is a tristate:
* -1 means default, so decide based on the file having branch data.
* 0/1 means the user chose a mode.
*/
if (((branch_mode == -1 && has_br_stack) || branch_mode == 1) &&
branch_call_mode == -1) {
sort__mode = SORT_MODE__BRANCH;
symbol_conf.cumulate_callchain = false;
}
if (branch_call_mode) {
callchain_param.key = CCKEY_ADDRESS;
callchain_param.branch_callstack = 1;
symbol_conf.use_callchain = true;
callchain_register_param(&callchain_param);
if (sort_order == NULL)
sort_order = "srcline,symbol,dso";
}
if (report.mem_mode) {
if (sort__mode == SORT_MODE__BRANCH) {
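The tristate decision above is easy to misread in this flattened diff (as
an aside, branch_call_mode is declared bool a few hunks up, so its '== -1'
comparison looks suspect in this version). A minimal standalone sketch of
the tristate pattern the new comment describes (simplified names, not the
perf implementation):

	#include <stdbool.h>
	#include <stdio.h>

	/* -1 = user said nothing, 0 = --no-branch-stack, 1 = --branch-stack */
	static bool want_branch_view(int branch_mode, bool has_br_stack)
	{
		if (branch_mode != -1)
			return branch_mode == 1; /* explicit choice wins */
		return has_br_stack;     /* default: follow perf.data contents */
	}

	int main(void)
	{
		printf("%d\n", want_branch_view(-1, true));  /* 1: file decides */
		printf("%d\n", want_branch_view(0, true));   /* 0: forced off */
		printf("%d\n", want_branch_view(1, false));  /* 1: forced on */
		return 0;
	}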

tools/perf/builtin-stat.c

@@ -388,20 +388,102 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
update_stats(&runtime_itlb_cache_stats[0], count[0]);
}
static void zero_per_pkg(struct perf_evsel *counter)
{
if (counter->per_pkg_mask)
memset(counter->per_pkg_mask, 0, MAX_NR_CPUS);
}
static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip)
{
unsigned long *mask = counter->per_pkg_mask;
struct cpu_map *cpus = perf_evsel__cpus(counter);
int s;
*skip = false;
if (!counter->per_pkg)
return 0;
if (cpu_map__empty(cpus))
return 0;
if (!mask) {
mask = zalloc(MAX_NR_CPUS);
if (!mask)
return -ENOMEM;
counter->per_pkg_mask = mask;
}
s = cpu_map__get_socket(cpus, cpu);
if (s < 0)
return -1;
*skip = test_and_set_bit(s, mask) == 1;
return 0;
}
static int read_cb(struct perf_evsel *evsel, int cpu, int thread __maybe_unused,
struct perf_counts_values *count)
{
struct perf_counts_values *aggr = &evsel->counts->aggr;
static struct perf_counts_values zero;
bool skip = false;
if (check_per_pkg(evsel, cpu, &skip)) {
pr_err("failed to read per-pkg counter\n");
return -1;
}
if (skip)
count = &zero;
switch (aggr_mode) {
case AGGR_CORE:
case AGGR_SOCKET:
case AGGR_NONE:
if (!evsel->snapshot)
perf_evsel__compute_deltas(evsel, cpu, count);
perf_counts_values__scale(count, scale, NULL);
evsel->counts->cpu[cpu] = *count;
update_shadow_stats(evsel, count->values);
break;
case AGGR_GLOBAL:
aggr->val += count->val;
if (scale) {
aggr->ena += count->ena;
aggr->run += count->run;
}
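/* no break: AGGR_GLOBAL falls through into the (empty) default case */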
default:
break;
}
return 0;
}
static int read_counter(struct perf_evsel *counter);
/*
* Read out the results of a single counter:
* aggregate counts across CPUs in system-wide mode
*/
static int read_counter_aggr(struct perf_evsel *counter)
{
struct perf_counts_values *aggr = &counter->counts->aggr;
struct perf_stat *ps = counter->priv;
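/* aggr.values[] aliases {val, ena, run}: the raw count plus the enabled and running times */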
u64 *count = counter->counts->aggr.values;
int i;
if (__perf_evsel__read(counter, perf_evsel__nr_cpus(counter),
thread_map__nr(evsel_list->threads), scale) < 0)
aggr->val = aggr->ena = aggr->run = 0;
if (read_counter(counter))
return -1;
if (!counter->snapshot)
perf_evsel__compute_deltas(counter, -1, aggr);
perf_counts_values__scale(aggr, scale, &counter->counts->scaled);
for (i = 0; i < 3; i++)
update_stats(&ps->res_stats[i], count[i]);
@@ -424,16 +506,21 @@ static int read_counter_aggr(struct perf_evsel *counter)
*/
static int read_counter(struct perf_evsel *counter)
{
u64 *count;
int cpu;
int nthreads = thread_map__nr(evsel_list->threads);
int ncpus = perf_evsel__nr_cpus(counter);
int cpu, thread;
for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
return -1;
if (counter->system_wide)
nthreads = 1;
count = counter->counts->cpu[cpu].values;
if (counter->per_pkg)
zero_per_pkg(counter);
update_shadow_stats(counter, count);
for (thread = 0; thread < nthreads; thread++) {
for (cpu = 0; cpu < ncpus; cpu++) {
if (perf_evsel__read_cb(counter, cpu, thread, read_cb))
return -1;
}
}
return 0;
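check_per_pkg() above uses test_and_set_bit() as a "have we already counted
this package?" test, so only the first CPU seen on each socket contributes
to a per-pkg counter. A self-contained userspace sketch of that dedup idea
(plain arrays instead of the kernel bitops; socket numbers made up):

	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_SOCKETS 64

	/* Mimics test_and_set_bit(): returns the old value, marks the slot. */
	static bool test_and_set(unsigned char *mask, int socket)
	{
		bool old = mask[socket];

		mask[socket] = 1;
		return old;
	}

	int main(void)
	{
		unsigned char mask[MAX_SOCKETS] = { 0 };
		int cpu_socket[] = { 0, 0, 1, 1 };  /* four CPUs, two sockets */

		for (int cpu = 0; cpu < 4; cpu++) {
			if (test_and_set(mask, cpu_socket[cpu]))
				continue; /* socket already counted, skip */
			printf("reading counter on cpu %d (socket %d)\n",
			       cpu, cpu_socket[cpu]);
		}
		return 0;
	}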

tools/perf/util/callchain.c

@@ -149,6 +149,10 @@ static int parse_callchain_sort_key(const char *value)
callchain_param.key = CCKEY_ADDRESS;
return 0;
}
if (!strncmp(value, "branch", strlen(value))) {
callchain_param.branch_callstack = 1;
return 0;
}
return -1;
}
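Note that strncmp(value, "branch", strlen(value)) compares only
strlen(value) bytes, so any prefix of "branch" selects this key. A tiny
standalone check of that matching rule:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *keys[] = { "branch", "bra", "b", "branches" };

		for (int i = 0; i < 4; i++)
			printf("%-8s -> %s\n", keys[i],
			       strncmp(keys[i], "branch", strlen(keys[i]))
			       ? "no match" : "match");
		return 0; /* "branches" is the only one that does not match */
	}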

tools/perf/util/callchain.h

@@ -63,6 +63,7 @@ struct callchain_param {
sort_chain_func_t sort;
enum chain_order order;
enum chain_key key;
bool branch_callstack;
};
extern struct callchain_param callchain_param;

tools/perf/util/evsel.c

@@ -954,40 +954,6 @@ int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
return 0;
}
int __perf_evsel__read(struct perf_evsel *evsel,
int ncpus, int nthreads, bool scale)
{
size_t nv = scale ? 3 : 1;
int cpu, thread;
struct perf_counts_values *aggr = &evsel->counts->aggr, count;
if (evsel->system_wide)
nthreads = 1;
aggr->val = aggr->ena = aggr->run = 0;
for (cpu = 0; cpu < ncpus; cpu++) {
for (thread = 0; thread < nthreads; thread++) {
if (FD(evsel, cpu, thread) < 0)
continue;
if (readn(FD(evsel, cpu, thread),
&count, nv * sizeof(u64)) < 0)
return -errno;
aggr->val += count.val;
if (scale) {
aggr->ena += count.ena;
aggr->run += count.run;
}
}
}
perf_evsel__compute_deltas(evsel, -1, aggr);
perf_counts_values__scale(aggr, scale, &evsel->counts->scaled);
return 0;
}
static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread)
{
struct perf_evsel *leader = evsel->leader;

tools/perf/util/evsel.h

@@ -93,6 +93,7 @@ struct perf_evsel {
bool system_wide;
bool tracking;
bool per_pkg;
unsigned long *per_pkg_mask;
/* parse modifier helper */
int exclude_GH;
int nr_members;
@@ -271,35 +272,6 @@ static inline int perf_evsel__read_on_cpu_scaled(struct perf_evsel *evsel,
return __perf_evsel__read_on_cpu(evsel, cpu, thread, true);
}
int __perf_evsel__read(struct perf_evsel *evsel, int ncpus, int nthreads,
bool scale);
/**
* perf_evsel__read - Read the aggregate results on all CPUs
*
* @evsel - event selector to read value
* @ncpus - Number of cpus affected, from zero
* @nthreads - Number of threads affected, from zero
*/
static inline int perf_evsel__read(struct perf_evsel *evsel,
int ncpus, int nthreads)
{
return __perf_evsel__read(evsel, ncpus, nthreads, false);
}
/**
* perf_evsel__read_scaled - Read the aggregate results on all CPUs, scaled
*
* @evsel - event selector to read value
* @ncpus - Number of cpus affected, from zero
* @nthreads - Number of threads affected, from zero
*/
static inline int perf_evsel__read_scaled(struct perf_evsel *evsel,
int ncpus, int nthreads)
{
return __perf_evsel__read(evsel, ncpus, nthreads, true);
}
int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
struct perf_sample *sample);

tools/perf/util/machine.c

@@ -12,6 +12,7 @@
#include <stdbool.h>
#include <symbol/kallsyms.h>
#include "unwind.h"
#include "linux/hash.h"
static void dsos__init(struct dsos *dsos)
{
@@ -1391,7 +1392,11 @@ static int add_callchain_ip(struct thread *thread,
al.filtered = 0;
al.sym = NULL;
thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
			   ip, &al);
if (cpumode == -1)
thread__find_cpumode_addr_location(thread, MAP__FUNCTION,
ip, &al);
else
thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
ip, &al);
if (al.sym != NULL) {
if (sort__has_parent && !*parent &&
@@ -1427,8 +1432,50 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
return bi;
}
#define CHASHSZ 127
#define CHASHBITS 7
#define NO_ENTRY 0xff
#define PERF_MAX_BRANCH_DEPTH 127
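/*
 * hash_64(from, CHASHBITS) returns a 7-bit value (0..127); the modulo by
 * CHASHSZ (127) below folds 127 back to 0, so the result always indexes
 * chash[]. Stored indices stay below PERF_MAX_BRANCH_DEPTH, so they can
 * never be confused with NO_ENTRY (0xff).
 */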
/* Remove loops. */
static int remove_loops(struct branch_entry *l, int nr)
{
int i, j, off;
unsigned char chash[CHASHSZ];
memset(chash, NO_ENTRY, sizeof(chash));
BUG_ON(PERF_MAX_BRANCH_DEPTH > 255);
for (i = 0; i < nr; i++) {
int h = hash_64(l[i].from, CHASHBITS) % CHASHSZ;
/* no collision handling for now */
if (chash[h] == NO_ENTRY) {
chash[h] = i;
} else if (l[chash[h]].from == l[i].from) {
bool is_loop = true;
/* check if it is a real loop */
off = 0;
for (j = chash[h]; j < i && i + off < nr; j++, off++)
if (l[j].from != l[i + off].from) {
is_loop = false;
break;
}
if (is_loop) {
memmove(l + i, l + i + off,
(nr - (i + off)) * sizeof(*l));
nr -= off;
}
}
}
return nr;
}
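/*
 * Worked example with hypothetical 'from' addresses: two back-to-back
 * iterations of a loop A->B->C followed by a tail D, E:
 *
 *   input:  A B C A B C D E
 *   output: A B C D E
 *
 * The second A collides with the first in chash, the entries in between
 * compare equal, and the memmove() drops the duplicated iteration. There
 * is no collision handling, so two different addresses that hash to the
 * same bucket are simply not collapsed.
 */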
static int thread__resolve_callchain_sample(struct thread *thread,
struct ip_callchain *chain,
struct branch_stack *branch,
struct symbol **parent,
struct addr_location *root_al,
int max_stack)
@@ -1438,22 +1485,82 @@ static int thread__resolve_callchain_sample(struct thread *thread,
int i;
int j;
int err;
int skip_idx __maybe_unused;
callchain_cursor_reset(&callchain_cursor);
if (chain->nr > PERF_MAX_STACK_DEPTH) {
pr_warning("corrupted callchain. skipping...\n");
return 0;
}
int skip_idx = -1;
int first_call = 0;
/*
* Based on DWARF debug information, some architectures skip
* a callchain entry saved by the kernel.
*/
skip_idx = arch_skip_callchain_idx(thread, chain);
if (chain->nr < PERF_MAX_STACK_DEPTH)
skip_idx = arch_skip_callchain_idx(thread, chain);
for (i = 0; i < chain_nr; i++) {
callchain_cursor_reset(&callchain_cursor);
/*
* Add branches to call stack for easier browsing. This gives
* more context for a sample than just the callers.
*
* This uses individual histograms of paths compared to the
* aggregated histograms the normal LBR mode uses.
*
* Limitations for now:
* - No extra filters
* - No annotations (should annotate somehow)
*/
if (branch && callchain_param.branch_callstack) {
int nr = min(max_stack, (int)branch->nr);
struct branch_entry be[nr];
if (branch->nr > PERF_MAX_BRANCH_DEPTH) {
pr_warning("corrupted branch chain. skipping...\n");
goto check_calls;
}
for (i = 0; i < nr; i++) {
if (callchain_param.order == ORDER_CALLEE) {
be[i] = branch->entries[i];
/*
* Check for overlap into the callchain.
* The return address is one off compared to
* the branch entry. To adjust for this
* assume the calling instruction is not longer
* than 8 bytes.
*/
if (i == skip_idx ||
chain->ips[first_call] >= PERF_CONTEXT_MAX)
first_call++;
else if (be[i].from < chain->ips[first_call] &&
be[i].from >= chain->ips[first_call] - 8)
first_call++;
} else
be[i] = branch->entries[branch->nr - i - 1];
}
nr = remove_loops(be, nr);
for (i = 0; i < nr; i++) {
err = add_callchain_ip(thread, parent, root_al,
-1, be[i].to);
if (!err)
err = add_callchain_ip(thread, parent, root_al,
-1, be[i].from);
if (err == -EINVAL)
break;
if (err)
return err;
}
chain_nr -= nr;
}
check_calls:
if (chain->nr > PERF_MAX_STACK_DEPTH) {
pr_warning("corrupted callchain. skipping...\n");
return 0;
}
for (i = first_call; i < chain_nr; i++) {
u64 ip;
if (callchain_param.order == ORDER_CALLEE)
@@ -1517,6 +1624,7 @@ int thread__resolve_callchain(struct thread *thread,
int max_stack)
{
int ret = thread__resolve_callchain_sample(thread, sample->callchain,
sample->branch_stack,
parent, root_al, max_stack);
if (ret)
return ret;
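The first_call adjustment above assumes a return address in the callchain
sits at most 8 bytes past the branch record's 'from' (the call instruction
itself). A toy check of that heuristic, with made-up addresses:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * A callchain entry is a return address (just past the call site);
	 * a branch record's 'from' is the call instruction. Treat them as
	 * the same call if 'from' lies within 8 bytes below the entry.
	 */
	static bool same_call_site(uint64_t branch_from, uint64_t chain_ip)
	{
		return branch_from < chain_ip && branch_from >= chain_ip - 8;
	}

	int main(void)
	{
		printf("%d\n", same_call_site(0x4005f9, 0x4005fe)); /* 1 */
		printf("%d\n", same_call_site(0x400100, 0x4005fe)); /* 0 */
		return 0;
	}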

tools/perf/util/symbol.h

@@ -102,7 +102,8 @@ struct symbol_conf {
demangle,
demangle_kernel,
filter_relative,
show_hist_headers;
show_hist_headers,
branch_callstack;
const char *vmlinux_name,
*kallsyms_name,
*source_prefix,