d99c22eabe
Add an option to control the degree of parallelism of the synthesize_mmap() code, which scans /proc/PID/task/PID/maps and can be time consuming. Mimic the way perf top handles the option. If not specified, it defaults to 1 thread, i.e. the default behavior before this option.

On a desktop computer the processing of /proc/PID/task/PID/maps isn't slow enough to warrant parallel processing and the thread creation has some cost - hence the default of 1. On a loaded server with >100 cores it is possible to see synthesis times in the order of seconds and in this case having the option is desirable.

As the processing is a synchronization point, it is legitimate to worry if Amdahl's law will apply to this patch. Profiling with this patch in place:

  https://lore.kernel.org/lkml/20200415054050.31645-4-irogers@google.com/

shows:

  ...
  - 32.59% __perf_event__synthesize_threads
     - 32.54% __event__synthesize_thread
        + 22.13% perf_event__synthesize_mmap_events
        + 6.68% perf_event__get_comm_ids.constprop.0
        + 1.49% process_synthesized_event
        + 1.29% __GI___readdir64
        + 0.60% __opendir
  ...

That is, the processing is 1.49% of execution time and there is plenty to make parallel. This is shown in the benchmark in this patch:

  https://lore.kernel.org/lkml/20200415054050.31645-2-irogers@google.com/

  Computing performance of multi threaded perf event synthesis by synthesizing events on CPU 0:
   Number of synthesis threads: 1
     Average synthesis took: 127729.000 usec (+- 3372.880 usec)
     Average num. events: 21548.600 (+- 0.306)
     Average time per event 5.927 usec
   Number of synthesis threads: 2
     Average synthesis took: 88863.500 usec (+- 385.168 usec)
     Average num. events: 21552.800 (+- 0.327)
     Average time per event 4.123 usec
   Number of synthesis threads: 3
     Average synthesis took: 83257.400 usec (+- 348.617 usec)
     Average num. events: 21553.200 (+- 0.327)
     Average time per event 3.863 usec
   Number of synthesis threads: 4
     Average synthesis took: 75093.000 usec (+- 422.978 usec)
     Average num. events: 21554.200 (+- 0.200)
     Average time per event 3.484 usec
   Number of synthesis threads: 5
     Average synthesis took: 64896.600 usec (+- 353.348 usec)
     Average num. events: 21558.000 (+- 0.000)
     Average time per event 3.010 usec
   Number of synthesis threads: 6
     Average synthesis took: 59210.200 usec (+- 342.890 usec)
     Average num. events: 21560.000 (+- 0.000)
     Average time per event 2.746 usec
   Number of synthesis threads: 7
     Average synthesis took: 54093.900 usec (+- 306.247 usec)
     Average num. events: 21562.000 (+- 0.000)
     Average time per event 2.509 usec
   Number of synthesis threads: 8
     Average synthesis took: 48938.700 usec (+- 341.732 usec)
     Average num. events: 21564.000 (+- 0.000)
     Average time per event 2.269 usec

Here the average time per synthesized event goes from 5.927 usec with 1 thread to 2.269 usec with 8. This isn't a linear speed up, as not all of the synthesize code has been made parallel. If the synthesis time was about 10 seconds then using 8 threads may bring this down to less than 4.

Signed-off-by: Stephane Eranian <eranian@google.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tony Jones <tonyj@suse.de>
Cc: yuzhoujian <yuzhoujian@didichuxing.com>
Link: http://lore.kernel.org/lkml/20200422155038.9380-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
80 lines
2.0 KiB
C
80 lines
2.0 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _PERF_RECORD_H
|
|
#define _PERF_RECORD_H
|
|
|
|
#include <time.h>
|
|
#include <stdbool.h>
|
|
#include <linux/types.h>
|
|
#include <linux/stddef.h>
|
|
#include <linux/perf_event.h>
|
|
#include "util/target.h"
|
|
|
|
/* Forward declaration: this header only refers to struct option through pointers. */
struct option;
|
|
|
|
struct record_opts {
|
|
struct target target;
|
|
bool group;
|
|
bool inherit_stat;
|
|
bool no_buffering;
|
|
bool no_inherit;
|
|
bool no_inherit_set;
|
|
bool no_samples;
|
|
bool raw_samples;
|
|
bool sample_address;
|
|
bool sample_phys_addr;
|
|
bool sample_weight;
|
|
bool sample_time;
|
|
bool sample_time_set;
|
|
bool sample_cpu;
|
|
bool period;
|
|
bool period_set;
|
|
bool running_time;
|
|
bool full_auxtrace;
|
|
bool auxtrace_snapshot_mode;
|
|
bool auxtrace_snapshot_on_exit;
|
|
bool auxtrace_sample_mode;
|
|
bool record_namespaces;
|
|
bool record_cgroup;
|
|
bool record_switch_events;
|
|
bool all_kernel;
|
|
bool all_user;
|
|
bool kernel_callchains;
|
|
bool user_callchains;
|
|
bool tail_synthesize;
|
|
bool overwrite;
|
|
bool ignore_missing_thread;
|
|
bool strict_freq;
|
|
bool sample_id;
|
|
bool no_bpf_event;
|
|
bool kcore;
|
|
unsigned int freq;
|
|
unsigned int mmap_pages;
|
|
unsigned int auxtrace_mmap_pages;
|
|
unsigned int user_freq;
|
|
u64 branch_stack;
|
|
u64 sample_intr_regs;
|
|
u64 sample_user_regs;
|
|
u64 default_interval;
|
|
u64 user_interval;
|
|
size_t auxtrace_snapshot_size;
|
|
const char *auxtrace_snapshot_opts;
|
|
const char *auxtrace_sample_opts;
|
|
bool sample_transaction;
|
|
unsigned initial_delay;
|
|
bool use_clockid;
|
|
clockid_t clockid;
|
|
u64 clockid_res_ns;
|
|
int nr_cblocks;
|
|
int affinity;
|
|
int mmap_flush;
|
|
unsigned int comp_level;
|
|
unsigned int nr_threads_synthesize;
|
|
};
|
|
|
|
/*
 * Pointer to read-only pointers to constant strings. Only declared here; the
 * definition is not part of this header.
 * NOTE(review): presumably the usage/help strings for the record command -
 * confirm at the definition.
 */
extern const char * const *record_usage;
|
|
/*
 * Pointer to struct option. Only declared here; the definition is not part of
 * this header.
 * NOTE(review): presumably the first entry of the record option table -
 * confirm at the definition.
 */
extern struct option *record_options;
|
|
|
|
/*
 * record__parse_freq - prototype only; the implementation is not in this header.
 * @opt:   option descriptor (not modified through this pointer)
 * @str:   string argument (not modified through this pointer)
 * @unset: NOTE(review): presumably non-zero when the option is negated - confirm
 *
 * Returns an int. NOTE(review): presumably 0 on success and non-zero on a
 * parse error, and presumably parses a sampling-frequency argument - confirm
 * against the definition.
 */
int record__parse_freq(const struct option *opt, const char *str, int unset);
|
|
|
|
#endif // _PERF_RECORD_H
|