From cfbd41b786519d4a15e1c15181556689bcf6635a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Wed, 15 Apr 2020 12:31:26 -0300
Subject: [PATCH 01/60] perf stat: Honour --timeout for forked workloads

When --timeout is used and a workload is specified to be started by
'perf stat', i.e.:

  $ perf stat --timeout 1000 sleep 1h

the --timeout wasn't being honoured, i.e. the workload, 'sleep 1h' in
the above example, should be terminated after 1000ms, but it wasn't;
'perf stat' was waiting for it to finish.

Fix it by sending a SIGTERM when the timeout expires.

Now it works:

  # perf stat -e cycles --timeout 1234 sleep 1h
  sleep: Terminated

   Performance counter stats for 'sleep 1h':

           1,066,692      cycles

         1.234314838 seconds time elapsed

         0.000750000 seconds user
         0.000000000 seconds sys

  #

Fixes: f1f8ad52f8bf ("perf stat: Add support to print counts after a period of time")
Reported-by: Konstantin Kharlamov
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=207243
Tested-by: Konstantin Kharlamov
Cc: Adrian Hunter
Acked-by: Jiri Olsa
Tested-by: Jiri Olsa
Cc: Namhyung Kim
Cc: yuzhoujian
Link: https://lore.kernel.org/lkml/20200415153803.GB20324@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-stat.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ec053dc1e35c..9207b6c45475 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -686,8 +686,11 @@ try_again_reset:
 				break;
 			}
 		}
-	if (child_pid != -1)
+	if (child_pid != -1) {
+		if (timeout)
+			kill(child_pid, SIGTERM);
 		wait4(child_pid, &status, 0, &stat_config.ru_data);
+	}

 	if (workload_exec_errno) {
 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));

From 943930e4729a64c11142a0370415663b39189996 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Thu, 12 Mar 2020 20:56:08 +0100
Subject: [PATCH 02/60] perf tools: Synthesize bpf_trampoline/dispatcher ksymbol event
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Synthesize bpf images (trampolines/dispatchers) on start, as ksymbol
events from /proc/kallsyms. Having this, perf can recognize samples from
those images, and 'perf report' and 'perf top' show them correctly.

The rest of the ksymbol handling is already in place for the bpf
programs monitoring, so only the initial state was needed.
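To make the classification rule concrete, here is a small self-contained
sketch (an editor's illustration, not part of the patch; the helper name
and buffer size are made up): a kallsyms symbol in the [bpf] module is
treated as an image when its name matches the trampoline or dispatcher
pattern, which the patch below checks with sscanf():

  #include <stdio.h>
  #include <string.h>

  /* Hypothetical helper: 1 if the name looks like a bpf trampoline or
   * dispatcher image, 0 otherwise. */
  static int looks_like_bpf_image(const char *name)
  {
          unsigned long id;
          char disp[256];

          return sscanf(name, "bpf_trampoline_%lu", &id) == 1 ||
                 sscanf(name, "bpf_dispatcher_%255s", disp) == 1;
  }

  int main(void)
  {
          printf("%d\n", looks_like_bpf_image("bpf_trampoline_24456")); /* 1 */
          printf("%d\n", looks_like_bpf_image("bpf_prog_run"));         /* 0 */
          return 0;
  }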
perf report output:

  # Overhead  Command     Shared Object                    Symbol
    12.37%  test_progs  [kernel.vmlinux]                 [k] entry_SYSCALL_64
    11.80%  test_progs  [kernel.vmlinux]                 [k] syscall_return_via_sysret
     9.63%  test_progs  bpf_prog_bcf7977d3b93787c_prog2  [k] bpf_prog_bcf7977d3b93787c_prog2
     6.90%  test_progs  bpf_trampoline_24456             [k] bpf_trampoline_24456
     6.36%  test_progs  [kernel.vmlinux]                 [k] memcpy_erms

Committer notes:

Use scnprintf() instead of strncpy() to overcome this on fedora:32,
rawhide and OpenMandriva Cooker:

    CC       /tmp/build/perf/util/bpf-event.o
  In file included from /usr/include/string.h:495,
                   from /git/linux/tools/lib/bpf/libbpf_common.h:12,
                   from /git/linux/tools/lib/bpf/bpf.h:31,
                   from util/bpf-event.c:4:
  In function 'strncpy',
      inlined from 'process_bpf_image' at util/bpf-event.c:323:2,
      inlined from 'kallsyms_process_symbol' at util/bpf-event.c:358:9:
  /usr/include/bits/string_fortified.h:106:10: error: '__builtin_strncpy' specified bound 256 equals destination size [-Werror=stringop-truncation]
    106 |   return __builtin___strncpy_chk (__dest, __src, __len, __bos (__dest));
        |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  cc1: all warnings being treated as errors

Signed-off-by: Jiri Olsa
Acked-by: Song Liu
Cc: Alexei Starovoitov
Cc: Andrii Nakryiko
Cc: Björn Töpel
Cc: Daniel Borkmann
Cc: David S. Miller
Cc: Jakub Kicinski
Cc: Jesper Dangaard Brouer
Cc: John Fastabend
Cc: Martin KaFai Lau
Cc: Yonghong Song
Link: https://lore.kernel.org/bpf/20200312195610.346362-14-jolsa@kernel.org/
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/bpf-event.c | 93 +++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index a3207d900339..0cd41a862952 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -6,6 +6,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #include "bpf-event.h"
 #include "debug.h"
 #include "dso.h"
@@ -290,11 +293,82 @@ out:
 	return err ? -1 : 0;
 }

+struct kallsyms_parse {
+	union perf_event	*event;
+	perf_event__handler_t	 process;
+	struct machine		*machine;
+	struct perf_tool	*tool;
+};
+
+static int
+process_bpf_image(char *name, u64 addr, struct kallsyms_parse *data)
+{
+	struct machine *machine = data->machine;
+	union perf_event *event = data->event;
+	struct perf_record_ksymbol *ksymbol;
+	int len;
+
+	ksymbol = &event->ksymbol;
+
+	*ksymbol = (struct perf_record_ksymbol) {
+		.header = {
+			.type = PERF_RECORD_KSYMBOL,
+			.size = offsetof(struct perf_record_ksymbol, name),
+		},
+		.addr      = addr,
+		.len       = page_size,
+		.ksym_type = PERF_RECORD_KSYMBOL_TYPE_BPF,
+		.flags     = 0,
+	};
+
+	len = scnprintf(ksymbol->name, KSYM_NAME_LEN, "%s", name);
+	ksymbol->header.size += PERF_ALIGN(len + 1, sizeof(u64));
+	memset((void *) event + event->header.size, 0, machine->id_hdr_size);
+	event->header.size += machine->id_hdr_size;
+
+	return perf_tool__process_synth_event(data->tool, event, machine,
+					      data->process);
+}
+
+static int
+kallsyms_process_symbol(void *data, const char *_name,
+			char type __maybe_unused, u64 start)
+{
+	char disp[KSYM_NAME_LEN];
+	char *module, *name;
+	unsigned long id;
+	int err = 0;
+
+	module = strchr(_name, '\t');
+	if (!module)
+		return 0;
+
+	/* We are going after [bpf] module ... */
+	if (strcmp(module + 1, "[bpf]"))
+		return 0;
+
+	name = memdup(_name, (module - _name) + 1);
+	if (!name)
+		return -ENOMEM;
+
+	name[module - _name] = 0;
+
+	/* .. and only for trampolines and dispatchers */
+	if ((sscanf(name, "bpf_trampoline_%lu", &id) == 1) ||
+	    (sscanf(name, "bpf_dispatcher_%s", disp) == 1))
+		err = process_bpf_image(name, start, data);
+
+	free(name);
+	return err;
+}
+
 int perf_event__synthesize_bpf_events(struct perf_session *session,
 				      perf_event__handler_t process,
 				      struct machine *machine,
 				      struct record_opts *opts)
 {
+	const char *kallsyms_filename = "/proc/kallsyms";
+	struct kallsyms_parse arg;
 	union perf_event *event;
 	__u32 id = 0;
 	int err;
@@ -303,6 +377,8 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
 	event = malloc(sizeof(event->bpf) + KSYM_NAME_LEN + machine->id_hdr_size);
 	if (!event)
 		return -1;
+
+	/* Synthesize all the bpf programs in system. */
 	while (true) {
 		err = bpf_prog_get_next_id(id, &id);
 		if (err) {
@@ -335,6 +411,23 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
 			break;
 		}
 	}
+
+	/* Synthesize all the bpf images - trampolines/dispatchers. */
+	if (symbol_conf.kallsyms_name != NULL)
+		kallsyms_filename = symbol_conf.kallsyms_name;
+
+	arg = (struct kallsyms_parse) {
+		.event   = event,
+		.process = process,
+		.machine = machine,
+		.tool    = session->tool,
+	};
+
+	if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol)) {
+		pr_err("%s: failed to synthesize bpf images: %s\n",
+		       __func__, strerror(errno));
+	}
+
 	free(event);
 	return err;
 }

From 7eddf7e74e54aea3b24410b3fb8911927836632f Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Thu, 12 Mar 2020 20:56:09 +0100
Subject: [PATCH 03/60] perf machine: Set ksymbol dso as loaded on arrival
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's no special load action for ksymbol data on the map__load/
dso__load path, where the kernel is getting loaded. It only gets
confused with the kernel kallsyms/vmlinux load for a bpf object, which
fails and could mess up the map. Disable any further load of the map
for ksymbol related dso/map.

Signed-off-by: Jiri Olsa
Acked-by: Song Liu
Cc: Alexei Starovoitov
Cc: Andrii Nakryiko
Cc: Björn Töpel
Cc: Daniel Borkmann
Cc: David S. Miller
Cc: Jakub Kicinski
Cc: Jesper Dangaard Brouer
Cc: John Fastabend
Cc: Martin KaFai Lau
Cc: Yonghong Song
Link: https://lore.kernel.org/bpf/20200312195610.346362-15-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/machine.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 97142e9671be..06aa4e4db63d 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -759,6 +759,7 @@ static int machine__process_ksymbol_register(struct machine *machine,
 		map->start = event->ksymbol.addr;
 		map->end = map->start + event->ksymbol.len;
 		maps__insert(&machine->kmaps, map);
+		dso__set_loaded(dso);
 	}

 	sym = symbol__new(map->map_ip(map, map->start),

From 3c29d4483e855b6ba5c6e35b0c81caad7d9e3984 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Thu, 12 Mar 2020 20:56:10 +0100
Subject: [PATCH 04/60] perf annotate: Add basic support for bpf_image
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the DSO_BINARY_TYPE__BPF_IMAGE dso binary type to recognize BPF
images that carry trampoline or dispatcher.

Upcoming patches will add support to read the image data, store it
within the BPF feature in perf.data and display it for annotation
purposes. Currently we only display the following message:

  # ./perf annotate bpf_trampoline_24456 --stdio
   Percent |      Source code & Disassembly of . for cycles (504 ...
  ---------------------------------------------------------------
       ...
           :      to be implemented

Signed-off-by: Jiri Olsa
Acked-by: Song Liu
Cc: Alexei Starovoitov
Cc: Andrii Nakryiko
Cc: Björn Töpel
Cc: Daniel Borkmann
Cc: David S. Miller
Cc: Jakub Kicinski
Cc: Jesper Dangaard Brouer
Cc: John Fastabend
Cc: Martin KaFai Lau
Cc: Yonghong Song
Link: https://lore.kernel.org/bpf/20200312195610.346362-16-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/annotate.c | 20 ++++++++++++++++++++
 tools/perf/util/dso.c      |  1 +
 tools/perf/util/dso.h      |  1 +
 tools/perf/util/machine.c  | 11 +++++++++++
 tools/perf/util/symbol.c   |  1 +
 5 files changed, 34 insertions(+)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index f1ea0d61eb5b..9760d58e979a 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1821,6 +1821,24 @@ static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused,
 }
 #endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT)

+static int
+symbol__disassemble_bpf_image(struct symbol *sym,
+			      struct annotate_args *args)
+{
+	struct annotation *notes = symbol__annotation(sym);
+	struct disasm_line *dl;
+
+	args->offset = -1;
+	args->line = strdup("to be implemented");
+	args->line_nr = 0;
+	dl = disasm_line__new(args);
+	if (dl)
+		annotation_line__add(&dl->al, &notes->src->source);
+
+	free(args->line);
+	return 0;
+}
+
 /*
  * Possibly create a new version of line with tabs expanded. Returns the
  * existing or new line, storage is updated if a new line is allocated. If
@@ -1920,6 +1938,8 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)

 	if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) {
 		return symbol__disassemble_bpf(sym, args);
+	} else if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) {
+		return symbol__disassemble_bpf_image(sym, args);
 	} else if (dso__is_kcore(dso)) {
 		kce.kcore_filename = symfs_filename;
 		kce.addr = map__rip_2objdump(map, sym->start);

diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 91f21239608b..f338990e0fe6 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -191,6 +191,7 @@ int dso__read_binary_type_filename(const struct dso *dso,
 	case DSO_BINARY_TYPE__GUEST_KALLSYMS:
 	case DSO_BINARY_TYPE__JAVA_JIT:
 	case DSO_BINARY_TYPE__BPF_PROG_INFO:
+	case DSO_BINARY_TYPE__BPF_IMAGE:
 	case DSO_BINARY_TYPE__NOT_FOUND:
 		ret = -1;
 		break;

diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index 2db64b79617a..9553a1fd9e8a 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -40,6 +40,7 @@ enum dso_binary_type {
 	DSO_BINARY_TYPE__GUEST_KCORE,
 	DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO,
 	DSO_BINARY_TYPE__BPF_PROG_INFO,
+	DSO_BINARY_TYPE__BPF_IMAGE,
 	DSO_BINARY_TYPE__NOT_FOUND,
 };

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 06aa4e4db63d..09845eae9c03 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -736,6 +736,12 @@ int machine__process_switch_event(struct machine *machine __maybe_unused,
 	return 0;
 }

+static int is_bpf_image(const char *name)
+{
+	return strncmp(name, "bpf_trampoline_", sizeof("bpf_trampoline_") - 1) ||
+	       strncmp(name, "bpf_dispatcher_", sizeof("bpf_dispatcher_") - 1);
+}
+
 static int machine__process_ksymbol_register(struct machine *machine,
 					     union perf_event *event,
 					     struct perf_sample *sample __maybe_unused)
@@ -760,6 +766,11 @@ static int machine__process_ksymbol_register(struct machine *machine,
 		map->end = map->start + event->ksymbol.len;
 		maps__insert(&machine->kmaps, map);
 		dso__set_loaded(dso);
+
+		if (is_bpf_image(event->ksymbol.name)) {
+			dso->binary_type = DSO_BINARY_TYPE__BPF_IMAGE;
+			dso__set_long_name(dso, "", false);
+		}
 	}

 	sym = symbol__new(map->map_ip(map, map->start),

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 26bc6a0096ce..8f4300492dc7 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1544,6 +1544,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod,
 		return true;

 	case DSO_BINARY_TYPE__BPF_PROG_INFO:
+	case DSO_BINARY_TYPE__BPF_IMAGE:
 	case DSO_BINARY_TYPE__NOT_FOUND:
 	default:
 		return false;

From 980737282232b752bb14dab96d77665c15889c36 Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:45:31 +0300
Subject: [PATCH 05/60] capabilities: Introduce CAP_PERFMON to kernel and user space

Introduce the CAP_PERFMON capability designed to secure system
performance monitoring and observability operations so that CAP_PERFMON
can assist the CAP_SYS_ADMIN capability in its governing role for
performance monitoring and observability subsystems.

CAP_PERFMON hardens system security and integrity during performance
monitoring and observability operations by decreasing the attack
surface that is available to a CAP_SYS_ADMIN privileged process [2].
Providing access to system performance monitoring and observability
operations under the CAP_PERFMON capability singly, without the rest of
the CAP_SYS_ADMIN credentials, excludes chances to misuse the
credentials and makes the operation more secure. Thus, CAP_PERFMON
implements the principle of least privilege for performance monitoring
and observability operations (POSIX IEEE 1003.1e: 2.2.2.39 principle of
least privilege: A security design principle that states that a process
or program be granted only those privileges (e.g., capabilities)
necessary to accomplish its legitimate function, and only for the time
that such privileges are actually required).

CAP_PERFMON meets the demand to secure system performance monitoring
and observability operations for adoption in security sensitive,
restricted, multiuser production environments (e.g. HPC clusters, cloud
and virtual compute environments), where root or CAP_SYS_ADMIN
credentials are not available to mass users of a system, and securely
unblocks applicability and scalability of system performance monitoring
and observability operations beyond root and CAP_SYS_ADMIN use cases.

CAP_PERFMON takes over the CAP_SYS_ADMIN credentials related to system
performance monitoring and observability operations and balances the
amount of CAP_SYS_ADMIN credentials following the recommendations in
the capabilities man page [1] for CAP_SYS_ADMIN: "Note: this capability
is overloaded; see Notes to kernel developers, below."

For backward compatibility reasons access to system performance
monitoring and observability subsystems of the kernel remains open for
CAP_SYS_ADMIN privileged processes, but CAP_SYS_ADMIN capability usage
for secure system performance monitoring and observability operations
is discouraged with respect to the designed CAP_PERFMON capability.

Although the software running under CAP_PERFMON can not ensure
avoidance of related hardware issues, the software can still mitigate
these issues following the official hardware issues mitigation
procedure [2]. The bugs in the software itself can be fixed following
the standard kernel development process [3] to maintain and harden
security of system performance monitoring and observability operations.
[1] http://man7.org/linux/man-pages/man7/capabilities.7.html
[2] https://www.kernel.org/doc/html/latest/process/embargoed-hardware-issues.html
[3] https://www.kernel.org/doc/html/latest/admin-guide/security-bugs.html

Signed-off-by: Alexey Budankov
Acked-by: James Morris
Acked-by: Serge E. Hallyn
Acked-by: Song Liu
Acked-by: Stephen Smalley
Tested-by: Arnaldo Carvalho de Melo
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/5590d543-82c6-490a-6544-08e6a5517db0@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 include/linux/capability.h          | 4 ++++
 include/uapi/linux/capability.h     | 8 +++++++-
 security/selinux/include/classmap.h | 4 ++--
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index ecce0f43c73a..027d7e4a853b 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
 extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
+static inline bool perfmon_capable(void)
+{
+	return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
+}

 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);

diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 272dc69fa080..e58c9636741b 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -367,8 +367,14 @@ struct vfs_ns_cap_data {

 #define CAP_AUDIT_READ		37

+/*
+ * Allow system performance and observability privileged operations
+ * using perf_events, i915_perf and other kernel subsystems
+ */

-#define CAP_LAST_CAP         CAP_AUDIT_READ
+#define CAP_PERFMON		38
+
+#define CAP_LAST_CAP         CAP_PERFMON

 #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)

diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 986f3ac14282..d233ab3f1533 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -27,9 +27,9 @@
 	    "audit_control", "setfcap"

 #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
-		"wake_alarm", "block_suspend", "audit_read"
+		"wake_alarm", "block_suspend", "audit_read", "perfmon"

-#if CAP_LAST_CAP > CAP_AUDIT_READ
+#if CAP_LAST_CAP > CAP_PERFMON
 #error New capability defined, please update COMMON_CAP2_PERMS.
 #endif

From 18aa18566218d4a46d940049b835314d2b071cc2 Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:46:24 +0300
Subject: [PATCH 06/60] perf/core: Open access to the core for CAP_PERFMON privileged process

Open access to monitoring of kernel code, CPUs, tracepoints and
namespaces data for a CAP_PERFMON privileged process. Providing the
access under the CAP_PERFMON capability singly, without the rest of
the CAP_SYS_ADMIN credentials, excludes chances to misuse the
credentials and makes the operation more secure.
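As an editor's illustration of the user-visible effect (not part of the
patch): the check relaxed here is the one that gates CPU-wide events.
The sketch below opens a counter with pid == -1, cpu == 0; with
perf_event_paranoid >= 1 it fails with EACCES unless the process has
CAP_PERFMON (or CAP_SYS_ADMIN):

  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <string.h>
  #include <stdio.h>
  #include <errno.h>

  int main(void)
  {
          struct perf_event_attr attr;
          int fd;

          memset(&attr, 0, sizeof(attr));
          attr.size   = sizeof(attr);
          attr.type   = PERF_TYPE_HARDWARE;
          attr.config = PERF_COUNT_HW_CPU_CYCLES;

          /* pid == -1, cpu == 0: count cycles for everything on CPU 0 */
          fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
          if (fd < 0)
                  fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
          else
                  close(fd);
          return fd < 0 ? 1 : 0;
  }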
CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required).

For backward compatibility reasons the access to the perf_events
subsystem remains open for CAP_SYS_ADMIN privileged processes, but
CAP_SYS_ADMIN usage for secure perf_events monitoring is discouraged
with respect to the CAP_PERFMON capability.

Signed-off-by: Alexey Budankov
Reviewed-by: James Morris
Tested-by: Arnaldo Carvalho de Melo
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: Jiri Olsa
Cc: linux-man@vger.kernel.org
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Serge Hallyn
Cc: Song Liu
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/471acaef-bb8a-5ce2-923f-90606b78eef9@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 include/linux/perf_event.h | 6 +++---
 kernel/events/core.c       | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9c3e7619c929..87e21681759c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1305,7 +1305,7 @@ static inline int perf_is_paranoid(void)

 static inline int perf_allow_kernel(struct perf_event_attr *attr)
 {
-	if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN))
+	if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
 		return -EACCES;

 	return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
@@ -1313,7 +1313,7 @@ static inline int perf_allow_kernel(struct perf_event_attr *attr)

 static inline int perf_allow_cpu(struct perf_event_attr *attr)
 {
-	if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN))
+	if (sysctl_perf_event_paranoid > 0 && !perfmon_capable())
 		return -EACCES;

 	return security_perf_event_open(attr, PERF_SECURITY_CPU);
@@ -1321,7 +1321,7 @@ static inline int perf_allow_cpu(struct perf_event_attr *attr)

 static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
 {
-	if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN))
+	if (sysctl_perf_event_paranoid > -1 && !perfmon_capable())
 		return -EPERM;

 	return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);

diff --git a/kernel/events/core.c b/kernel/events/core.c
index bc9b98a9af9a..74025b7b83a0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11504,7 +11504,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	}

 	if (attr.namespaces) {
-		if (!capable(CAP_SYS_ADMIN))
+		if (!perfmon_capable())
 			return -EACCES;
 	}

From c9e0924e5c2b59365f9c0d43ff8722e79ecf4088 Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:47:01 +0300
Subject: [PATCH 07/60] perf/core: open access to probes for CAP_PERFMON privileged process

Open access to monitoring via kprobes and uprobes and eBPF tracing for
a CAP_PERFMON privileged process. Providing the access under the
CAP_PERFMON capability singly, without the rest of the CAP_SYS_ADMIN
credentials, excludes chances to misuse the credentials and makes the
operation more secure.

perf kprobes and uprobes are used by ftrace and eBPF. perf probe uses
ftrace to define new kprobe events, and those events are treated as
tracepoint events.
eBPF defines new probes via the perf_event_open interface and then the
probes are used in eBPF tracing.

CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required).

For backward compatibility reasons access to the perf_events subsystem
remains open for CAP_SYS_ADMIN privileged processes, but CAP_SYS_ADMIN
usage for secure perf_events monitoring is discouraged with respect to
the CAP_PERFMON capability.

Signed-off-by: Alexey Budankov
Reviewed-by: James Morris
Tested-by: Arnaldo Carvalho de Melo
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Serge Hallyn
Cc: Song Liu
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Cc: linux-man@vger.kernel.org
Link: http://lore.kernel.org/lkml/3c129d9a-ba8a-3483-ecc5-ad6c8e7c203f@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 kernel/events/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 74025b7b83a0..52951e9e8e1b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9397,7 +9397,7 @@ static int perf_kprobe_event_init(struct perf_event *event)
 	if (event->attr.type != perf_kprobe.type)
 		return -ENOENT;

-	if (!capable(CAP_SYS_ADMIN))
+	if (!perfmon_capable())
 		return -EACCES;

 	/*
@@ -9457,7 +9457,7 @@ static int perf_uprobe_event_init(struct perf_event *event)
 	if (event->attr.type != perf_uprobe.type)
 		return -ENOENT;

-	if (!capable(CAP_SYS_ADMIN))
+	if (!perfmon_capable())
 		return -EACCES;

 	/*

From 6b3e0e2e04615df128b2d38fa1dd1fcb84f2504c Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:47:35 +0300
Subject: [PATCH 08/60] perf tools: Support CAP_PERFMON capability

Extend error messages to mention the CAP_PERFMON capability as an
option to substitute the CAP_SYS_ADMIN capability for secure system
performance monitoring and observability operations. Make
perf_event_paranoid_check() and __cmd_ftrace() aware of the CAP_PERFMON
capability.

CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required).

For backward compatibility reasons access to the perf_events subsystem
remains open for CAP_SYS_ADMIN privileged processes, but CAP_SYS_ADMIN
usage for secure perf_events monitoring is discouraged with respect to
the CAP_PERFMON capability.
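For illustration (an editor's sketch, not part of the patch), the kind
of check a tool can perform is to ask libcap whether CAP_PERFMON (38)
is in its effective set; with an older libcap that doesn't know about
value 38, cap_get_flag() fails instead, which is exactly the failure
mode discussed in the committer testing notes below. Build with -lcap:

  #include <sys/capability.h>
  #include <stdio.h>

  #ifndef CAP_PERFMON
  #define CAP_PERFMON 38
  #endif

  int main(void)
  {
          cap_flag_value_t val = CAP_CLEAR;
          cap_t caps = cap_get_proc();

          if (!caps)
                  return 1;
          if (cap_get_flag(caps, CAP_PERFMON, CAP_EFFECTIVE, &val))
                  perror("cap_get_flag"); /* old libcap: invalid capability */
          else
                  printf("cap_perfmon %s\n", val == CAP_SET ? "set" : "not set");
          cap_free(caps);
          return 0;
  }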
Committer testing:

Using a libcap with this patch:

  diff --git a/libcap/include/uapi/linux/capability.h b/libcap/include/uapi/linux/capability.h
  index 78b2fd4c8a95..89b5b0279b60 100644
  --- a/libcap/include/uapi/linux/capability.h
  +++ b/libcap/include/uapi/linux/capability.h
  @@ -366,8 +366,9 @@ struct vfs_ns_cap_data {

   #define CAP_AUDIT_READ		37

  +#define CAP_PERFMON		38

  -#define CAP_LAST_CAP         CAP_AUDIT_READ
  +#define CAP_LAST_CAP         CAP_PERFMON

   #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)

Note that using '38' in place of 'cap_perfmon' works to some degree with
an old libcap, its only when cap_get_flag() is called that libcap
performs an error check based on the maximum value known for
capabilities that it will fail.

This makes determining the default of perf_event_attr.exclude_kernel to
fail, as it can't determine if CAP_PERFMON is in place. Using
'perf top -e cycles' avoids the default check and sets
perf_event_attr.exclude_kernel to 1.

As root, with a libcap supporting CAP_PERFMON:

  # groupadd perf_users
  # adduser perf -g perf_users
  # mkdir ~perf/bin
  # cp ~acme/bin/perf ~perf/bin/
  # chgrp perf_users ~perf/bin/perf
  # setcap "cap_perfmon,cap_sys_ptrace,cap_syslog=ep" ~perf/bin/perf
  # getcap ~perf/bin/perf
  /home/perf/bin/perf = cap_sys_ptrace,cap_syslog,cap_perfmon+ep
  # ls -la ~perf/bin/perf
  -rwxr-xr-x. 1 root perf_users 16968552 Apr  9 13:10 /home/perf/bin/perf

As the 'perf' user in the 'perf_users' group:

  $ perf top -a --stdio
  Error:
  Failed to mmap with 1 (Operation not permitted)
  $

Either add the cap_ipc_lock capability to the perf binary or reduce the
ring buffer size to some smaller value:

  $ perf top -m10 -a --stdio
  rounding mmap pages size to 64K (16 pages)
  Error:
  Failed to mmap with 1 (Operation not permitted)
  $ perf top -m4 -a --stdio
  Error:
  Failed to mmap with 1 (Operation not permitted)
  $ perf top -m2 -a --stdio
   PerfTop:     762 irqs/sec  kernel:49.7%  exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles],  (all, 4 CPUs)
  ------------------------------------------------------------------------------------------------------

     9.83%  perf          [.] __symbols__insert
     8.58%  perf          [.] rb_next
     5.91%  [kernel]      [k] module_get_kallsym
     5.66%  [kernel]      [k] kallsyms_expand_symbol.constprop.0
     3.98%  libc-2.29.so  [.] __GI_____strtoull_l_internal
     3.66%  perf          [.] rb_insert_color
     2.34%  [kernel]      [k] vsnprintf
     2.30%  [kernel]      [k] string_nocheck
     2.16%  libc-2.29.so  [.] _IO_getdelim
     2.15%  [kernel]      [k] number
     2.13%  [kernel]      [k] format_decode
     1.58%  libc-2.29.so  [.] _IO_feof
     1.52%  libc-2.29.so  [.] __strcmp_avx2
     1.50%  perf          [.] rb_set_parent_color
     1.47%  libc-2.29.so  [.] __libc_calloc
     1.24%  [kernel]      [k] do_syscall_64
     1.17%  [kernel]      [k] __x86_indirect_thunk_rax

  $ perf record -a sleep 1
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.552 MB perf.data (74 samples) ]
  $ perf evlist
  cycles
  $ perf evlist -v
  cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1
  $ perf report | head -20
  # To display the perf.data header info, please use --header/--header-only options.
  #
  #
  # Total Lost Samples: 0
  #
  # Samples: 74  of event 'cycles'
  # Event count (approx.): 15694834
  #
  # Overhead  Command          Shared Object               Symbol
  # ........  ...............  ..........................  ......................................
  #
      19.62%  perf             [kernel.vmlinux]            [k] strnlen_user
      13.88%  swapper          [kernel.vmlinux]            [k] intel_idle
      13.83%  ksoftirqd/0      [kernel.vmlinux]            [k] pfifo_fast_dequeue
      13.51%  swapper          [kernel.vmlinux]            [k] kmem_cache_free
       6.31%  gnome-shell      [kernel.vmlinux]            [k] kmem_cache_free
       5.66%  kworker/u8:3+ix  [kernel.vmlinux]            [k] delay_tsc
       4.42%  perf             [kernel.vmlinux]            [k] __set_cpus_allowed_ptr
       3.45%  kworker/2:1-eve  [kernel.vmlinux]            [k] shmem_truncate_range
       2.29%  gnome-shell      libgobject-2.0.so.0.6000.7  [.] g_closure_ref
  $

Signed-off-by: Alexey Budankov
Reviewed-by: James Morris
Acked-by: Jiri Olsa
Acked-by: Namhyung Kim
Tested-by: Arnaldo Carvalho de Melo
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: Peter Zijlstra
Cc: Serge Hallyn
Cc: Song Liu
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/a66d5648-2b8e-577e-e1f2-1d56c017ab5e@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-ftrace.c |  5 +++--
 tools/perf/design.txt       |  3 ++-
 tools/perf/util/cap.h       |  4 ++++
 tools/perf/util/evsel.c     | 10 +++++-----
 tools/perf/util/util.c      |  1 +
 5 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index d5adc417a4ca..55eda54240fb 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -284,10 +284,11 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
 		.events = POLLIN,
 	};

-	if (!perf_cap__capable(CAP_SYS_ADMIN)) {
+	if (!(perf_cap__capable(CAP_PERFMON) ||
+	      perf_cap__capable(CAP_SYS_ADMIN))) {
 		pr_err("ftrace only works for %s!\n",
 #ifdef HAVE_LIBCAP_SUPPORT
-		"users with the SYS_ADMIN capability"
+		"users with the CAP_PERFMON or CAP_SYS_ADMIN capability"
 #else
 		"root"
 #endif

diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index 0453ba26cdbd..a42fab308ff6 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -258,7 +258,8 @@ gets schedule to. Per task counters can be created by any user, for their
 own tasks.

 A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
-all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
+all events on CPU-x. Per CPU counters need CAP_PERFMON or CAP_SYS_ADMIN
+privilege.

 The 'flags' parameter is currently unused and must be zero.
diff --git a/tools/perf/util/cap.h b/tools/perf/util/cap.h
index 051dc590ceee..ae52878c0b2e 100644
--- a/tools/perf/util/cap.h
+++ b/tools/perf/util/cap.h
@@ -29,4 +29,8 @@ static inline bool perf_cap__capable(int cap __maybe_unused)
 #define CAP_SYSLOG	34
 #endif

+#ifndef CAP_PERFMON
+#define CAP_PERFMON	38
+#endif
+
 #endif /* __PERF_CAP_H */

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index eb880efbce16..d23db6755f51 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2523,14 +2523,14 @@ int perf_evsel__open_strerror(struct evsel *evsel, struct target *target,
 		"You may not have permission to collect %sstats.\n\n"
 		"Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n"
 		"which controls use of the performance events system by\n"
-		"unprivileged users (without CAP_SYS_ADMIN).\n\n"
+		"unprivileged users (without CAP_PERFMON or CAP_SYS_ADMIN).\n\n"
 		"The current value is %d:\n\n"
 		"  -1: Allow use of (almost) all events by all users\n"
 		"      Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK\n"
-		">= 0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN\n"
-		"      Disallow raw tracepoint access by users without CAP_SYS_ADMIN\n"
-		">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n"
-		">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN\n\n"
+		">= 0: Disallow ftrace function tracepoint by users without CAP_PERFMON or CAP_SYS_ADMIN\n"
+		"      Disallow raw tracepoint access by users without CAP_PERFMON or CAP_SYS_ADMIN\n"
+		">= 1: Disallow CPU event access by users without CAP_PERFMON or CAP_SYS_ADMIN\n"
+		">= 2: Disallow kernel profiling by users without CAP_PERFMON or CAP_SYS_ADMIN\n\n"
 		"To make this setting permanent, edit /etc/sysctl.conf too, e.g.:\n\n"
 		"	kernel.perf_event_paranoid = -1\n" ,
 				 target->system_wide ? "system-wide " : "",

diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index d707c9624dd9..37a9492edb3e 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -290,6 +290,7 @@ int perf_event_paranoid(void)
 bool perf_event_paranoid_check(int max_level)
 {
 	return perf_cap__capable(CAP_SYS_ADMIN) ||
+	       perf_cap__capable(CAP_PERFMON) ||
	       perf_event_paranoid() <= max_level;
 }

From 4e3d3456b78fa5a70e65de0d7c5309b814281ae3 Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:48:15 +0300
Subject: [PATCH 09/60] drm/i915/perf: Open access for CAP_PERFMON privileged process

Open access to i915_perf monitoring for a CAP_PERFMON privileged
process. Providing the access under the CAP_PERFMON capability singly,
without the rest of the CAP_SYS_ADMIN credentials, excludes chances to
misuse the credentials and makes the operation more secure.

CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required).

For backward compatibility reasons access to the i915_events subsystem
remains open for CAP_SYS_ADMIN privileged processes, but CAP_SYS_ADMIN
usage for secure i915_events monitoring is discouraged with respect to
the CAP_PERFMON capability.
Signed-off-by: Alexey Budankov
Reviewed-by: James Morris
Acked-by: Lionel Landwerlin
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Serge Hallyn
Cc: Song Liu
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/e3e3292f-f765-ea98-e59c-fbe2db93fd34@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 drivers/gpu/drm/i915/i915_perf.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 551be589d6f4..5fb174931231 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -3433,10 +3433,10 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf,
 	/* Similar to perf's kernel.perf_paranoid_cpu sysctl option
 	 * we check a dev.i915.perf_stream_paranoid sysctl option
 	 * to determine if it's ok to access system wide OA counters
-	 * without CAP_SYS_ADMIN privileges.
+	 * without CAP_PERFMON or CAP_SYS_ADMIN privileges.
 	 */
 	if (privileged_op &&
-	    i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
+	    i915_perf_stream_paranoid && !perfmon_capable()) {
 		DRM_DEBUG("Insufficient privileges to open i915 perf stream\n");
 		ret = -EACCES;
 		goto err_ctx;
@@ -3629,9 +3629,8 @@ static int read_properties_unlocked(struct i915_perf *perf,
 			} else
 				oa_freq_hz = 0;

-			if (oa_freq_hz > i915_oa_max_sample_rate &&
-			    !capable(CAP_SYS_ADMIN)) {
-				DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without root privileges\n",
+			if (oa_freq_hz > i915_oa_max_sample_rate && !perfmon_capable()) {
+				DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without CAP_PERFMON or CAP_SYS_ADMIN privileges\n",
 					  i915_oa_max_sample_rate);
 				return -EACCES;
 			}
@@ -4052,7 +4051,7 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}

-	if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
+	if (i915_perf_stream_paranoid && !perfmon_capable()) {
 		DRM_DEBUG("Insufficient privileges to add i915 OA config\n");
 		return -EACCES;
 	}
@@ -4199,7 +4198,7 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
 		return -ENOTSUPP;
 	}

-	if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
+	if (i915_perf_stream_paranoid && !perfmon_capable()) {
 		DRM_DEBUG("Insufficient privileges to remove i915 OA config\n");
 		return -EACCES;
 	}

From 031258da05956646c5606023ab0abe10a7e68ea1 Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:48:54 +0300
Subject: [PATCH 10/60] trace/bpf_trace: Open access for CAP_PERFMON privileged process

Open access to bpf_trace monitoring for a CAP_PERFMON privileged
process. Providing the access under the CAP_PERFMON capability singly,
without the rest of the CAP_SYS_ADMIN credentials, excludes chances to
misuse the credentials and makes the operation more secure.
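The function this patch changes, perf_event_query_prog_array(), backs
the PERF_EVENT_IOC_QUERY_BPF ioctl. A hedged editor's sketch (not part
of the patch; 'fd' is assumed to be an already-opened
PERF_TYPE_TRACEPOINT event) of the user-space path that now requires
CAP_PERFMON instead of CAP_SYS_ADMIN:

  #include <linux/perf_event.h>
  #include <sys/ioctl.h>
  #include <stdlib.h>
  #include <stdio.h>

  static void query_attached_bpf_progs(int fd)
  {
          struct perf_event_query_bpf *query;
          __u32 i, ids_len = 16;

          query = calloc(1, sizeof(*query) + ids_len * sizeof(__u32));
          if (!query)
                  return;
          query->ids_len = ids_len;

          if (ioctl(fd, PERF_EVENT_IOC_QUERY_BPF, query) == 0) {
                  for (i = 0; i < query->prog_cnt && i < ids_len; i++)
                          printf("attached bpf prog id: %u\n", query->ids[i]);
          } else {
                  perror("PERF_EVENT_IOC_QUERY_BPF"); /* EPERM without CAP_PERFMON */
          }
          free(query);
  }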
CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required).

For backward compatibility reasons access to bpf_trace monitoring
remains open for CAP_SYS_ADMIN privileged processes, but CAP_SYS_ADMIN
usage for secure bpf_trace monitoring is discouraged with respect to
the CAP_PERFMON capability.

Signed-off-by: Alexey Budankov
Reviewed-by: James Morris
Acked-by: Song Liu
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Serge Hallyn
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/c0a0ae47-8b6e-ff3e-416b-3cd1faaf71c0@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 kernel/trace/bpf_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1796747a77..d7d88007dc6d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1468,7 +1468,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 	u32 *ids, prog_cnt, ids_len;
 	int ret;

-	if (!capable(CAP_SYS_ADMIN))
+	if (!perfmon_capable())
 		return -EPERM;
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -EINVAL;

From ff46758313e688fca7d762b3e6ead32843999511 Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:49:36 +0300
Subject: [PATCH 11/60] powerpc/perf: open access for CAP_PERFMON privileged process

Open access to monitoring for a CAP_PERFMON privileged process.
Providing the access under the CAP_PERFMON capability singly, without
the rest of the CAP_SYS_ADMIN credentials, excludes chances to misuse
the credentials and makes the operation more secure.

CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required).

For backward compatibility reasons access to the monitoring remains
open for CAP_SYS_ADMIN privileged processes, but CAP_SYS_ADMIN usage
for secure monitoring is discouraged with respect to the CAP_PERFMON
capability.
Signed-off-by: Alexey Budankov
Reviewed-by: James Morris
Acked-by: Anju T Sudhakar
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Serge Hallyn
Cc: Song Liu
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/ac98cd9f-b59e-673c-c70d-180b3e7695d2@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 arch/powerpc/perf/imc-pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index eb82dda884e5..0edcfd0b491d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -976,7 +976,7 @@ static int thread_imc_event_init(struct perf_event *event)
 	if (event->attr.type != event->pmu->type)
 		return -ENOENT;

-	if (!capable(CAP_SYS_ADMIN))
+	if (!perfmon_capable())
 		return -EACCES;

 	/* Sampling not supported */
@@ -1412,7 +1412,7 @@ static int trace_imc_event_init(struct perf_event *event)
 	if (event->attr.type != event->pmu->type)
 		return -ENOENT;

-	if (!capable(CAP_SYS_ADMIN))
+	if (!perfmon_capable())
 		return -EACCES;

 	/* Return if this is a couting event */

From cf91baf3f7f39a0cd29072e21ed0e4bb1ab3b382 Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:50:15 +0300
Subject: [PATCH 12/60] parisc/perf: open access for CAP_PERFMON privileged process

Open access to monitoring for a CAP_PERFMON privileged process.
Providing the access under the CAP_PERFMON capability singly, without
the rest of the CAP_SYS_ADMIN credentials, excludes chances to misuse
the credentials and makes the operation more secure.

CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required).

For backward compatibility reasons access to the monitoring remains
open for CAP_SYS_ADMIN privileged processes, but CAP_SYS_ADMIN usage
for secure monitoring is discouraged with respect to the CAP_PERFMON
capability.
Signed-off-by: Alexey Budankov
Reviewed-by: James Morris
Acked-by: Helge Deller
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Serge Hallyn
Cc: Song Liu
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/8cc98809-d35b-de0f-de02-4cf554f3cf62@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 arch/parisc/kernel/perf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c
index e1a8fee3ad49..d46b6709ec56 100644
--- a/arch/parisc/kernel/perf.c
+++ b/arch/parisc/kernel/perf.c
@@ -300,7 +300,7 @@ static ssize_t perf_write(struct file *file, const char __user *buf,
 	else
 		return -EFAULT;

-	if (!capable(CAP_SYS_ADMIN))
+	if (!perfmon_capable())
 		return -EACCES;

 	if (count != sizeof(uint32_t))

From cea7d0d4a59b4efd0e1fe067130b4c06ab4d412f Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:51:21 +0300
Subject: [PATCH 13/60] drivers/perf: Open access for CAP_PERFMON privileged process

Open access to monitoring for a CAP_PERFMON privileged process.
Providing the access under the CAP_PERFMON capability singly, without
the rest of the CAP_SYS_ADMIN credentials, excludes chances to misuse
the credentials and makes the operation more secure.

CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required).

For backward compatibility reasons access to the monitoring remains
open for CAP_SYS_ADMIN privileged processes, but CAP_SYS_ADMIN usage
for secure monitoring is discouraged with respect to the CAP_PERFMON
capability.
Signed-off-by: Alexey Budankov
Reviewed-by: James Morris
Acked-by: Will Deacon
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Serge Hallyn
Cc: Song Liu
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/4ec1d6f7-548c-8d1c-f84a-cebeb9674e4e@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 drivers/perf/arm_spe_pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index b72c04852599..0e0961a2b405 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -274,7 +274,7 @@ static u64 arm_spe_event_to_pmscr(struct perf_event *event)
 	if (!attr->exclude_kernel)
 		reg |= BIT(SYS_PMSCR_EL1_E1SPE_SHIFT);

-	if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && capable(CAP_SYS_ADMIN))
+	if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && perfmon_capable())
 		reg |= BIT(SYS_PMSCR_EL1_CX_SHIFT);

 	return reg;
@@ -700,7 +700,7 @@ static int arm_spe_pmu_event_init(struct perf_event *event)
 		return -EOPNOTSUPP;

 	reg = arm_spe_event_to_pmscr(event);
-	if (!capable(CAP_SYS_ADMIN) &&
+	if (!perfmon_capable() &&
 	    (reg & (BIT(SYS_PMSCR_EL1_PA_SHIFT) |
 		    BIT(SYS_PMSCR_EL1_CX_SHIFT) |
 		    BIT(SYS_PMSCR_EL1_PCT_SHIFT))))

From ab76878bb720cbd35a05ae868387f4373a58c949 Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:53:07 +0300
Subject: [PATCH 14/60] drivers/oprofile: Open access for CAP_PERFMON privileged process

Open access to monitoring for a CAP_PERFMON privileged process.
Providing the access under the CAP_PERFMON capability singly, without
the rest of the CAP_SYS_ADMIN credentials, excludes chances to misuse
the credentials and makes the operation more secure.

CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required).

For backward compatibility reasons access to the monitoring remains
open for CAP_SYS_ADMIN privileged processes, but CAP_SYS_ADMIN usage
for secure monitoring is discouraged with respect to the CAP_PERFMON
capability.
Signed-off-by: Alexey Budankov
Acked-by: James Morris
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Serge Hallyn
Cc: Song Liu
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/691f1096-b15f-9b12-50a0-c2b93918149e@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 drivers/oprofile/event_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c
index 12ea4a4ad607..6c9edc8bbc95 100644
--- a/drivers/oprofile/event_buffer.c
+++ b/drivers/oprofile/event_buffer.c
@@ -113,7 +113,7 @@ static int event_buffer_open(struct inode *inode, struct file *file)
 {
 	int err = -EPERM;

-	if (!capable(CAP_SYS_ADMIN))
+	if (!perfmon_capable())
 		return -EPERM;

 	if (test_and_set_bit_lock(0, &buffer_opened))

From 902a8dcc5ba6c5dc3332e8806b01be2f0f7ef2e4 Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:54:01 +0300
Subject: [PATCH 15/60] doc/admin-guide: Update perf-security.rst with CAP_PERFMON information

Update the perf-security.rst documentation file with the information
related to usage of the CAP_PERFMON capability to secure performance
monitoring and observability operations in the system.

Committer notes:

While testing 'perf top' under cap_perfmon I noticed that it needs some
more capability and Alexey pointed out cap_ipc_lock, as needed by this
kernel chunk:

  kernel/events/core.c:

  6101         if ((locked > lock_limit) && perf_is_paranoid() &&
                       !capable(CAP_IPC_LOCK)) {
                       ret = -EPERM;
                       goto unlock;
               }

So I added it to the documentation, and also mentioned that if the
libcap version doesn't yet support 'cap_perfmon', its numeric value can
be used instead, i.e. if:

  # setcap "cap_perfmon,cap_ipc_lock,cap_sys_ptrace,cap_syslog=ep" perf

fails, try:

  # setcap "38,cap_ipc_lock,cap_sys_ptrace,cap_syslog=ep" perf

I also added a paragraph stating that using an unpatched libcap will
fail the check for CAP_PERFMON, as it checks the cap number against a
maximum to see if it is valid, which makes it use as the default the
'cycles:u' event, even though a cap_perfmon capable perf binary can get
kernel samples. To work around that, just use, e.g.:

  # perf top -e cycles
  # perf record -e cycles

And it will sample kernel and user modes.

Signed-off-by: Alexey Budankov
Tested-by: Arnaldo Carvalho de Melo
Cc: Alexei Starovoitov
Cc: Andi Kleen
Cc: Igor Lubashev
Cc: James Morris
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Serge Hallyn
Cc: Song Liu
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/17278551-9399-9ebe-d665-8827016a217d@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 Documentation/admin-guide/perf-security.rst | 84 +++++++++++++++------
 1 file changed, 60 insertions(+), 24 deletions(-)

diff --git a/Documentation/admin-guide/perf-security.rst b/Documentation/admin-guide/perf-security.rst
index 72effa7c23b9..1307b5274a0f 100644
--- a/Documentation/admin-guide/perf-security.rst
+++ b/Documentation/admin-guide/perf-security.rst
@@ -1,6 +1,6 @@
 .. _perf_security:

-Perf Events and tool security
+Perf events and tool security
 =============================

 Overview
@@ -42,11 +42,11 @@ categories:
 Data that belong to the fourth category can potentially contain
 sensitive process data. If PMUs in some monitoring modes capture values
 of execution context registers or data from process memory then access
-to such monitoring capabilities requires to be ordered and secured
-properly. So, perf_events/Perf performance monitoring is the subject for
-security access control management [5]_ .
+to such monitoring modes requires to be ordered and secured properly.
+So, perf_events performance monitoring and observability operations are
+the subject for security access control management [5]_ .

-perf_events/Perf access control
+perf_events access control
 -------------------------------

 To perform security checks, the Linux implementation splits processes
@@ -66,11 +66,25 @@ into distinct units, known as capabilities [6]_ , which can be
 independently enabled and disabled on per-thread basis for processes and
 files of unprivileged users.

-Unprivileged processes with enabled CAP_SYS_ADMIN capability are treated
+Unprivileged processes with enabled CAP_PERFMON capability are treated
 as privileged processes with respect to perf_events performance
-monitoring and bypass *scope* permissions checks in the kernel.
+monitoring and observability operations, thus, bypass *scope* permissions
+checks in the kernel. CAP_PERFMON implements the principle of least
+privilege [13]_ (POSIX 1003.1e: 2.2.2.39) for performance monitoring and
+observability operations in the kernel and provides a secure approach to
+performance monitoring and observability in the system.

-Unprivileged processes using perf_events system call API is also subject
+For backward compatibility reasons the access to perf_events monitoring and
+observability operations is also open for CAP_SYS_ADMIN privileged
+processes but CAP_SYS_ADMIN usage for secure monitoring and observability
+use cases is discouraged with respect to the CAP_PERFMON capability.
+If system audit records [14]_ for a process using perf_events system call
+API contain denial records of acquiring both CAP_PERFMON and CAP_SYS_ADMIN
+capabilities then providing the process with CAP_PERFMON capability singly
+is recommended as the preferred secure approach to resolve double access
+denial logging related to usage of performance monitoring and observability.
+
+Unprivileged processes using perf_events system call are also subject
 for PTRACE_MODE_READ_REALCREDS ptrace access mode check [7]_ , whose
 outcome determines whether monitoring is permitted. So unprivileged
 processes provided with CAP_SYS_PTRACE capability are effectively
@@ -82,14 +96,14 @@ performance analysis of monitored processes or a system. For example,
 CAP_SYSLOG capability permits reading kernel space memory addresses from
 /proc/kallsyms file.

-perf_events/Perf privileged users
+Privileged Perf users groups
 ---------------------------------

 Mechanisms of capabilities, privileged capability-dumb files [6]_ and
-file system ACLs [10]_ can be used to create a dedicated group of
-perf_events/Perf privileged users who are permitted to execute
-performance monitoring without scope limits. The following steps can be
-taken to create such a group of privileged Perf users.
+file system ACLs [10]_ can be used to create dedicated groups of
+privileged Perf users who are permitted to execute performance monitoring
+and observability without scope limits.
+The following steps can be
+taken to create such groups of privileged Perf users.

 1. Create perf_users group of privileged Perf users, assign perf_users
    group to Perf tool executable and limit access to the executable for
@@ -108,30 +122,51 @@ taken to create such a group of privileged Perf users.
    -rwxr-x---  2 root perf_users  11M Oct 19 15:12 perf

 2. Assign the required capabilities to the Perf tool executable file and
-   enable members of perf_users group with performance monitoring
+   enable members of perf_users group with monitoring and observability
    privileges [6]_ :

    ::

-     # setcap "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
-     # setcap -v "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
+     # setcap "cap_perfmon,cap_sys_ptrace,cap_syslog=ep" perf
+     # setcap -v "cap_perfmon,cap_sys_ptrace,cap_syslog=ep" perf
      perf: OK
      # getcap perf
-     perf = cap_sys_ptrace,cap_sys_admin,cap_syslog+ep
+     perf = cap_sys_ptrace,cap_syslog,cap_perfmon+ep
+
+If the libcap installed doesn't yet support "cap_perfmon", use "38" instead,
+i.e.:
+
+::
+
+  # setcap "38,cap_ipc_lock,cap_sys_ptrace,cap_syslog=ep" perf
+
+Note that you may need to have 'cap_ipc_lock' in the mix for tools such as
+'perf top', alternatively use 'perf top -m N', to reduce the memory that
+it uses for the perf ring buffer, see the memory allocation section below.
+
+Using a libcap without support for CAP_PERFMON will make cap_get_flag(caps, 38,
+CAP_EFFECTIVE, &val) fail, which will lead the default event to be 'cycles:u',
+so as a workaround explicitly ask for the 'cycles' event, i.e.:
+
+::
+
+  # perf top -e cycles
+
+To get kernel and user samples with a perf binary with just CAP_PERFMON.

 As a result, members of perf_users group are capable of conducting
-performance monitoring by using functionality of the configured Perf
-tool executable that, when executes, passes perf_events subsystem scope
-checks.
+performance monitoring and observability by using functionality of the
+configured Perf tool executable that, when executes, passes perf_events
+subsystem scope checks.

 This specific access control management is only available to superuser
 or root running processes with CAP_SETPCAP, CAP_SETFCAP [6]_
 capabilities.

-perf_events/Perf unprivileged users
+Unprivileged users
 -----------------------------------

-perf_events/Perf *scope* and *access* control for unprivileged processes
+perf_events *scope* and *access* control for unprivileged processes
 is governed by perf_event_paranoid [2]_ setting:

 -1:
@@ -166,7 +201,7 @@ is governed by perf_event_paranoid [2]_ setting:
      perf_event_mlock_kb locking limit is imposed but ignored for
      unprivileged processes with CAP_IPC_LOCK capability.

-perf_events/Perf resource control
+Resource control
 ---------------------------------

 Open file descriptors
@@ -227,4 +262,5 @@ Bibliography
 .. [10] ``_
 .. [11] ``_
 .. [12] ``_
-
+.. [13] ``_
+.. [14] ``_

From 025b16f81dd7f51f29d0109399d669438c63b6ce Mon Sep 17 00:00:00 2001
From: Alexey Budankov
Date: Thu, 2 Apr 2020 11:54:39 +0300
Subject: [PATCH 16/60] doc/admin-guide: update kernel.rst with CAP_PERFMON information

Update the kernel.rst documentation file with the information related
to usage of the CAP_PERFMON capability to secure performance monitoring
and observability operations in the system.
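As an editor's aside (not part of the patch), the paranoid levels this
documentation describes can be inspected programmatically; a minimal
sketch, mirroring the decision logic that tools/perf pairs with the
CAP_PERFMON check added in patch 08:

  #include <stdio.h>

  static int perf_event_paranoid_level(void)
  {
          int level = 2; /* documented default */
          FILE *f = fopen("/proc/sys/kernel/perf_event_paranoid", "r");

          if (f) {
                  if (fscanf(f, "%d", &level) != 1)
                          level = 2;
                  fclose(f);
          }
          return level;
  }

  int main(void)
  {
          int level = perf_event_paranoid_level();

          printf("perf_event_paranoid = %d\n", level);
          if (level >= 2)
                  printf("unprivileged: user-space profiling of own tasks only\n");
          else if (level >= 1)
                  printf("unprivileged: kernel profiling allowed, no CPU-wide events\n");
          else
                  printf("unprivileged: CPU-wide events allowed\n");
          return 0;
  }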
Signed-off-by: Alexey Budankov Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: James Morris Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Serge Hallyn Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-man@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Link: http://lore.kernel.org/lkml/84c32383-14a2-fa35-16b6-f9e59bd37240@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- Documentation/admin-guide/sysctl/kernel.rst | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 39c95c0e13d3..7e4c28dfc9ca 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -730,7 +730,13 @@ perf_event_paranoid =================== Controls use of the performance events system by unprivileged -users (without CAP_SYS_ADMIN). The default value is 2. +users (without CAP_PERFMON). The default value is 2. + +For backward compatibility reasons access to system performance +monitoring and observability remains open for CAP_SYS_ADMIN +privileged processes but CAP_SYS_ADMIN usage for secure system +performance monitoring and observability operations is discouraged +with respect to CAP_PERFMON use cases. === ================================================================== -1 Allow use of (almost) all events by all users. @@ -739,13 +745,13 @@ users (without CAP_SYS_ADMIN). The default value is 2. ``CAP_IPC_LOCK``. >=0 Disallow ftrace function tracepoint by users without - ``CAP_SYS_ADMIN``. + ``CAP_PERFMON``. - Disallow raw tracepoint access by users without ``CAP_SYS_ADMIN``. + Disallow raw tracepoint access by users without ``CAP_PERFMON``. ->=1 Disallow CPU event access by users without ``CAP_SYS_ADMIN``. +>=1 Disallow CPU event access by users without ``CAP_PERFMON``. ->=2 Disallow kernel profiling by users without ``CAP_SYS_ADMIN``. +>=2 Disallow kernel profiling by users without ``CAP_PERFMON``. 
=== ================================================================== From 1a2725f3ee5571cf07966f467b73a9941bcbacb8 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 2 Apr 2020 17:15:48 +0300 Subject: [PATCH 17/60] perf script: Simplify auxiliary event printing functions This simplifies the print functions for the following perf script options: --show-task-events --show-namespace-events --show-cgroup-events --show-mmap-events --show-switch-events --show-lost-events --show-bpf-events Example: # perf record --switch-events -a -e cycles -c 10000 sleep 1 Before: # perf script --show-task-events --show-namespace-events --show-cgroup-events --show-mmap-events --show-switch-events --show-lost-events --show-bpf-events > out-before.txt After: # perf script --show-task-events --show-namespace-events --show-cgroup-events --show-mmap-events --show-switch-events --show-lost-events --show-bpf-events > out-after.txt # diff -s out-before.txt out-after.txt Files out-before.txt and out-after.tx are identical Signed-off-by: Adrian Hunter Acked-by: Jiri Olsa Link: http://lore.kernel.org/lkml/20200402141548.21283-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 312 ++++++++---------------------------- 1 file changed, 70 insertions(+), 242 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 1f57a7ecdf3d..8bf3ba280312 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2040,7 +2040,7 @@ static int cleanup_scripting(void) static bool filter_cpu(struct perf_sample *sample) { - if (cpu_list) + if (cpu_list && sample->cpu != (u32)-1) return !test_bit(sample->cpu, cpu_bitmap); return false; } @@ -2138,41 +2138,59 @@ static int process_attr(struct perf_tool *tool, union perf_event *event, return err; } +static int print_event_with_time(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine, + pid_t pid, pid_t tid, u64 timestamp) +{ + struct perf_script *script = container_of(tool, struct perf_script, tool); + struct perf_session *session = script->session; + struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); + struct thread *thread = NULL; + + if (evsel && !evsel->core.attr.sample_id_all) { + sample->cpu = 0; + sample->time = timestamp; + sample->pid = pid; + sample->tid = tid; + } + + if (filter_cpu(sample)) + return 0; + + if (tid != -1) + thread = machine__findnew_thread(machine, pid, tid); + + if (thread && evsel) { + perf_sample__fprintf_start(sample, thread, evsel, + event->header.type, stdout); + } + + perf_event__fprintf(event, stdout); + + thread__put(thread); + + return 0; +} + +static int print_event(struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, struct machine *machine, + pid_t pid, pid_t tid) +{ + return print_event_with_time(tool, event, sample, machine, pid, tid, 0); +} + static int process_comm_event(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - int ret = -1; - - thread = machine__findnew_thread(machine, event->comm.pid, event->comm.tid); - if (thread == NULL) { - pr_debug("problem processing COMM event, skipping it.\n"); - return -1; - } - if (perf_event__process_comm(tool, event, sample, 
machine) < 0) - goto out; + return -1; - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - sample->tid = event->comm.tid; - sample->pid = event->comm.pid; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_COMM, stdout); - perf_event__fprintf(event, stdout); - } - ret = 0; -out: - thread__put(thread); - return ret; + return print_event(tool, event, sample, machine, event->comm.pid, + event->comm.tid); } static int process_namespaces_event(struct perf_tool *tool, @@ -2180,37 +2198,11 @@ static int process_namespaces_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - int ret = -1; - - thread = machine__findnew_thread(machine, event->namespaces.pid, - event->namespaces.tid); - if (thread == NULL) { - pr_debug("problem processing NAMESPACES event, skipping it.\n"); - return -1; - } - if (perf_event__process_namespaces(tool, event, sample, machine) < 0) - goto out; + return -1; - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - sample->tid = event->namespaces.tid; - sample->pid = event->namespaces.pid; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_NAMESPACES, stdout); - perf_event__fprintf(event, stdout); - } - ret = 0; -out: - thread__put(thread); - return ret; + return print_event(tool, event, sample, machine, event->namespaces.pid, + event->namespaces.tid); } static int process_cgroup_event(struct perf_tool *tool, @@ -2218,34 +2210,11 @@ static int process_cgroup_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - int ret = -1; - - thread = machine__findnew_thread(machine, sample->pid, sample->tid); - if (thread == NULL) { - pr_debug("problem processing CGROUP event, skipping it.\n"); - return -1; - } - if (perf_event__process_cgroup(tool, event, sample, machine) < 0) - goto out; + return -1; - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_CGROUP, stdout); - perf_event__fprintf(event, stdout); - } - ret = 0; -out: - thread__put(thread); - return ret; + return print_event(tool, event, sample, machine, sample->pid, + sample->tid); } static int process_fork_event(struct perf_tool *tool, @@ -2253,69 +2222,24 @@ static int process_fork_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - if (perf_event__process_fork(tool, event, sample, machine) < 0) return -1; - thread = machine__findnew_thread(machine, event->fork.pid, event->fork.tid); - if (thread == NULL) { - pr_debug("problem processing FORK event, skipping it.\n"); - return -1; - } - - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = event->fork.time; - 
sample->tid = event->fork.tid; - sample->pid = event->fork.pid; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_FORK, stdout); - perf_event__fprintf(event, stdout); - } - thread__put(thread); - - return 0; + return print_event_with_time(tool, event, sample, machine, + event->fork.pid, event->fork.tid, + event->fork.time); } static int process_exit_event(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) { - int err = 0; - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - - thread = machine__findnew_thread(machine, event->fork.pid, event->fork.tid); - if (thread == NULL) { - pr_debug("problem processing EXIT event, skipping it.\n"); + /* Print before 'exit' deletes anything */ + if (print_event_with_time(tool, event, sample, machine, event->fork.pid, + event->fork.tid, event->fork.time)) return -1; - } - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - sample->tid = event->fork.tid; - sample->pid = event->fork.pid; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_EXIT, stdout); - perf_event__fprintf(event, stdout); - } - - if (perf_event__process_exit(tool, event, sample, machine) < 0) - err = -1; - - thread__put(thread); - return err; + return perf_event__process_exit(tool, event, sample, machine); } static int process_mmap_event(struct perf_tool *tool, @@ -2323,33 +2247,11 @@ static int process_mmap_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - if (perf_event__process_mmap(tool, event, sample, machine) < 0) return -1; - thread = machine__findnew_thread(machine, event->mmap.pid, event->mmap.tid); - if (thread == NULL) { - pr_debug("problem processing MMAP event, skipping it.\n"); - return -1; - } - - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - sample->tid = event->mmap.tid; - sample->pid = event->mmap.pid; - } - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_MMAP, stdout); - perf_event__fprintf(event, stdout); - } - thread__put(thread); - return 0; + return print_event(tool, event, sample, machine, event->mmap.pid, + event->mmap.tid); } static int process_mmap2_event(struct perf_tool *tool, @@ -2357,33 +2259,11 @@ static int process_mmap2_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - if (perf_event__process_mmap2(tool, event, sample, machine) < 0) return -1; - thread = machine__findnew_thread(machine, event->mmap2.pid, event->mmap2.tid); - if (thread == NULL) { - pr_debug("problem processing MMAP2 event, skipping it.\n"); - return -1; - } - - if (!evsel->core.attr.sample_id_all) { - sample->cpu = 0; - sample->time = 0; - sample->tid = event->mmap2.tid; - sample->pid = event->mmap2.pid; - } - if (!filter_cpu(sample)) { - 
perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_MMAP2, stdout); - perf_event__fprintf(event, stdout); - } - thread__put(thread); - return 0; + return print_event(tool, event, sample, machine, event->mmap2.pid, + event->mmap2.tid); } static int process_switch_event(struct perf_tool *tool, @@ -2391,10 +2271,7 @@ static int process_switch_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); if (perf_event__process_switch(tool, event, sample, machine) < 0) return -1; @@ -2405,20 +2282,8 @@ static int process_switch_event(struct perf_tool *tool, if (!script->show_switch_events) return 0; - thread = machine__findnew_thread(machine, sample->pid, - sample->tid); - if (thread == NULL) { - pr_debug("problem processing SWITCH event, skipping it.\n"); - return -1; - } - - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_SWITCH, stdout); - perf_event__fprintf(event, stdout); - } - thread__put(thread); - return 0; + return print_event(tool, event, sample, machine, sample->pid, + sample->tid); } static int @@ -2427,23 +2292,8 @@ process_lost_event(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine) { - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - struct thread *thread; - - thread = machine__findnew_thread(machine, sample->pid, - sample->tid); - if (thread == NULL) - return -1; - - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_LOST, stdout); - perf_event__fprintf(event, stdout); - } - thread__put(thread); - return 0; + return print_event(tool, event, sample, machine, sample->pid, + sample->tid); } static int @@ -2462,33 +2312,11 @@ process_bpf_events(struct perf_tool *tool __maybe_unused, struct perf_sample *sample, struct machine *machine) { - struct thread *thread; - struct perf_script *script = container_of(tool, struct perf_script, tool); - struct perf_session *session = script->session; - struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); - if (machine__process_ksymbol(machine, event, sample) < 0) return -1; - if (!evsel->core.attr.sample_id_all) { - perf_event__fprintf(event, stdout); - return 0; - } - - thread = machine__findnew_thread(machine, sample->pid, sample->tid); - if (thread == NULL) { - pr_debug("problem processing MMAP event, skipping it.\n"); - return -1; - } - - if (!filter_cpu(sample)) { - perf_sample__fprintf_start(sample, thread, evsel, - event->header.type, stdout); - perf_event__fprintf(event, stdout); - } - - thread__put(thread); - return 0; + return print_event(tool, event, sample, machine, sample->pid, + sample->tid); } static void sig_handler(int sig __maybe_unused) From 2a4b51666af8bf0b67ccc2e53120bad27351917c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 Apr 2020 08:43:53 -0700 Subject: [PATCH 18/60] perf bench: Add event synthesis benchmark Event synthesis may occur at the start or end (tail) of a perf command. In system-wide mode it can scan every process in /proc, which may add seconds of latency before event recording. 
Add a new benchmark that times how long event synthesis takes with and without data synthesis. An example execution looks like: $ perf bench internals synthesize # Running 'internals/synthesize' benchmark: Average synthesis took: 168.253800 usec Average data synthesis took: 208.104700 usec Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andrey Zhizhikin Cc: Kan Liang Cc: Kefeng Wang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lore.kernel.org/lkml/20200402154357.107873-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-bench.txt | 8 ++ tools/perf/bench/Build | 2 +- tools/perf/bench/bench.h | 2 +- tools/perf/bench/synthesize.c | 101 ++++++++++++++++++++++++ tools/perf/builtin-bench.c | 6 ++ 5 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 tools/perf/bench/synthesize.c diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index 0921a3c67381..bad16512c48d 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -61,6 +61,9 @@ SUBSYSTEM 'epoll':: Eventpoll (epoll) stressing benchmarks. +'internals':: + Benchmark internal perf functionality. + 'all':: All benchmark subsystems. @@ -214,6 +217,11 @@ Suite for evaluating concurrent epoll_wait calls. *ctl*:: Suite for evaluating multiple epoll_ctl calls. +SUITES FOR 'internals' +~~~~~~~~~~~~~~~~~~~~~~ +*synthesize*:: +Suite for evaluating perf's event synthesis performance. + SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index e4e321b6f883..042827385c87 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -6,9 +6,9 @@ perf-y += futex-wake.o perf-y += futex-wake-parallel.o perf-y += futex-requeue.o perf-y += futex-lock-pi.o - perf-y += epoll-wait.o perf-y += epoll-ctl.o +perf-y += synthesize.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 4aa6de1aa67d..4d669c803237 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -41,9 +41,9 @@ int bench_futex_wake_parallel(int argc, const char **argv); int bench_futex_requeue(int argc, const char **argv); /* pi futexes */ int bench_futex_lock_pi(int argc, const char **argv); - int bench_epoll_wait(int argc, const char **argv); int bench_epoll_ctl(int argc, const char **argv); +int bench_synthesize(int argc, const char **argv); #define BENCH_FORMAT_DEFAULT_STR "default" #define BENCH_FORMAT_DEFAULT 0 diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c new file mode 100644 index 000000000000..6291257bc9c9 --- /dev/null +++ b/tools/perf/bench/synthesize.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark synthesis of perf events such as at the start of a 'perf + * record'. Synthesis is done on the current process and the 'dummy' event + * handlers are invoked that support dump_trace but otherwise do nothing. + * + * Copyright 2019 Google LLC. 
+ */ +#include +#include "bench.h" +#include "../util/debug.h" +#include "../util/session.h" +#include "../util/synthetic-events.h" +#include "../util/target.h" +#include "../util/thread_map.h" +#include "../util/tool.h" +#include +#include +#include + +static unsigned int iterations = 10000; + +static const struct option options[] = { + OPT_UINTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average"), + OPT_END() +}; + +static const char *const usage[] = { + "perf bench internals synthesize ", + NULL +}; + + +static int do_synthesize(struct perf_session *session, + struct perf_thread_map *threads, + struct target *target, bool data_mmap) +{ + const unsigned int nr_threads_synthesize = 1; + struct timeval start, end, diff; + u64 runtime_us; + unsigned int i; + double average; + int err; + + gettimeofday(&start, NULL); + for (i = 0; i < iterations; i++) { + err = machine__synthesize_threads(&session->machines.host, + target, threads, data_mmap, + nr_threads_synthesize); + if (err) + return err; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + average = (double)runtime_us/(double)iterations; + printf("Average %ssynthesis took: %f usec\n", + data_mmap ? "data " : "", average); + return 0; +} + +int bench_synthesize(int argc, const char **argv) +{ + struct perf_tool tool; + struct perf_session *session; + struct target target = { + .pid = "self", + }; + struct perf_thread_map *threads; + int err; + + argc = parse_options(argc, argv, options, usage, 0); + + session = perf_session__new(NULL, false, NULL); + if (IS_ERR(session)) { + pr_err("Session creation failed.\n"); + return PTR_ERR(session); + } + threads = thread_map__new_by_pid(getpid()); + if (!threads) { + pr_err("Thread map creation failed.\n"); + err = -ENOMEM; + goto err_out; + } + perf_tool__fill_defaults(&tool); + + err = do_synthesize(session, threads, &target, false); + if (err) + goto err_out; + + err = do_synthesize(session, threads, &target, true); + +err_out: + if (threads) + perf_thread_map__put(threads); + + perf_session__delete(session); + return err; +} diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index c06fe21c8613..11c79a8d85d6 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -76,6 +76,11 @@ static struct bench epoll_benchmarks[] = { }; #endif // HAVE_EVENTFD +static struct bench internals_benchmarks[] = { + { "synthesize", "Benchmark perf event synthesis", bench_synthesize }, + { NULL, NULL, NULL } +}; + struct collection { const char *name; const char *summary; @@ -92,6 +97,7 @@ static struct collection collections[] = { #ifdef HAVE_EVENTFD {"epoll", "Epoll stressing benchmarks", epoll_benchmarks }, #endif + { "internals", "Perf-internals benchmarks", internals_benchmarks }, { "all", "All benchmarks", NULL }, { NULL, NULL, NULL } }; From c6fddb28bad26e5472cb7acf7b04cd5126f1a4ab Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 2 Apr 2020 08:43:54 -0700 Subject: [PATCH 19/60] tools api fs: Make xxx__mountpoint() more scalable The xxx_mountpoint() interface provided by fs.c finds mount points for common pseudo filesystems. The first time xxx_mountpoint() is invoked, it scans the mount table (/proc/mounts) looking for a match. If found, it is cached. The price to scan /proc/mounts is paid once if the mount is found. When the mount point is not found, subsequent calls to xxx_mountpoint() scan /proc/mounts over and over again. There is no caching. 
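Condensed to its core, the pre-patch lookup behaves like the sketch below (an illustrative condensation only; the real functions are fs__mountpoint() and fs__get_mountpoint() in tools/lib/api/fs/fs.c):

    static const char *mountpoint(struct fs *fs)
    {
    	if (fs->found)			/* hit: cached by an earlier scan */
    		return fs->path;
    	return fs__get_mountpoint(fs);	/* miss: rescans /proc/mounts on every call */
    }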
This causes a scaling issue in perf record with hugetlbfs__mountpoint(). The function is called for each process found in synthesize__mmap_events(). If the machine has thousands of processes and if /proc/mounts has many entries, this could cause major overhead in perf record. We have observed multi-second slowdowns on some configurations. As an example on a laptop:

Before: $ sudo umount /dev/hugepages $ strace -e trace=openat -o /tmp/tt perf record -a ls $ fgrep mounts /tmp/tt 285

After: $ sudo umount /dev/hugepages $ strace -e trace=openat -o /tmp/tt perf record -a ls $ fgrep mounts /tmp/tt 1

One could argue that the non-caching in case the mount point is not found is intentional. That way subsequent calls may discover a mount point if the sysadmin mounts the filesystem. But the same argument could be made against caching the mount point: it could be unmounted, causing errors. It all depends on the intent of the interface. This patch assumes it is expected to scan /proc/mounts once. The patch documents the caching behavior in the fs.h header file. An alternative would be to just fix perf record, but that would only solve the problem for hugetlbfs__mountpoint(), and there could be similar issues (possibly down the line) with other xxx_mountpoint() calls in perf or other tools.

Signed-off-by: Stephane Eranian Reviewed-by: Ian Rogers Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andrey Zhizhikin Cc: Kan Liang Cc: Kefeng Wang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Petr Mladek Cc: Thomas Gleixner Link: http://lore.kernel.org/lkml/20200402154357.107873-3-irogers@google.com Signed-off-by: Ian Rogers Signed-off-by: Arnaldo Carvalho de Melo
--- tools/lib/api/fs/fs.c | 17 +++++++++++++++++ tools/lib/api/fs/fs.h | 12 ++++++++++++ 2 files changed, 29 insertions(+)
diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index 027b18f7ed8c..82f53d81a7a7 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -90,6 +90,7 @@ struct fs { const char * const *mounts; char path[PATH_MAX]; bool found; + bool checked; long magic; }; @@ -111,31 +112,37 @@ static struct fs fs__entries[] = { .name = "sysfs", .mounts = sysfs__fs_known_mountpoints, .magic = SYSFS_MAGIC, + .checked = false, }, [FS__PROCFS] = { .name = "proc", .mounts = procfs__known_mountpoints, .magic = PROC_SUPER_MAGIC, + .checked = false, }, [FS__DEBUGFS] = { .name = "debugfs", .mounts = debugfs__known_mountpoints, .magic = DEBUGFS_MAGIC, + .checked = false, }, [FS__TRACEFS] = { .name = "tracefs", .mounts = tracefs__known_mountpoints, .magic = TRACEFS_MAGIC, + .checked = false, }, [FS__HUGETLBFS] = { .name = "hugetlbfs", .mounts = hugetlbfs__known_mountpoints, .magic = HUGETLBFS_MAGIC, + .checked = false, }, [FS__BPF_FS] = { .name = "bpf", .mounts = bpf_fs__known_mountpoints, .magic = BPF_FS_MAGIC, + .checked = false, }, }; @@ -158,6 +165,7 @@ static bool fs__read_mounts(struct fs *fs) } fclose(fp); + fs->checked = true; return fs->found = found; } @@ -220,6 +228,7 @@ static bool fs__env_override(struct fs *fs) return false; fs->found = true; + fs->checked = true; strncpy(fs->path, override_path, sizeof(fs->path) - 1); fs->path[sizeof(fs->path) - 1] = '\0'; return true; } @@ -246,6 +255,14 @@ static const char *fs__mountpoint(int idx) if (fs->found) return (const char *)fs->path; + /* The mount point was already checked and did not exist, so + * return NULL to avoid scanning again. + * This makes the found and not-found paths cost equivalent + * in case of multiple calls.
+ */ + if (fs->checked) + return NULL; + return fs__get_mountpoint(fs); } diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h index 936edb95e1f3..aa222ca30311 100644 --- a/tools/lib/api/fs/fs.h +++ b/tools/lib/api/fs/fs.h @@ -18,6 +18,18 @@ const char *name##__mount(void); \ bool name##__configured(void); \ +/* + * The xxxx__mountpoint() entry points find the first match mount point for each + * filesystems listed below, where xxxx is the filesystem type. + * + * The interface is as follows: + * + * - If a mount point is found on first call, it is cached and used for all + * subsequent calls. + * + * - If a mount point is not found, NULL is returned on first call and all + * subsequent calls. + */ FS(sysfs) FS(procfs) FS(debugfs) From 04ed4ccb9c07868bc0cb41f699391332bf62220c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 Apr 2020 08:43:55 -0700 Subject: [PATCH 20/60] perf synthetic-events: save 4kb from 2 stack frames Reuse an existing char buffer to avoid two PATH_MAX sized char buffers. Reduces stack frame sizes by 4kb. perf_event__synthesize_mmap_events before 'sub $0x45b8,%rsp' after 'sub $0x35b8,%rsp'. perf_event__get_comm_ids before 'sub $0x2028,%rsp' after 'sub $0x1028,%rsp'. The performance impact of this change is negligible. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Andrey Zhizhikin Cc: Jiri Olsa Cc: Kan Liang Cc: Kefeng Wang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lore.kernel.org/lkml/20200402154357.107873-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/synthetic-events.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index a661b122d9d8..9d4aa951eaa6 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -71,7 +71,6 @@ int perf_tool__process_synth_event(struct perf_tool *tool, static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len, pid_t *tgid, pid_t *ppid) { - char filename[PATH_MAX]; char bf[4096]; int fd; size_t size = 0; @@ -81,11 +80,11 @@ static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len, *tgid = -1; *ppid = -1; - snprintf(filename, sizeof(filename), "/proc/%d/status", pid); + snprintf(bf, sizeof(bf), "/proc/%d/status", pid); - fd = open(filename, O_RDONLY); + fd = open(bf, O_RDONLY); if (fd < 0) { - pr_debug("couldn't open %s\n", filename); + pr_debug("couldn't open %s\n", bf); return -1; } @@ -281,9 +280,9 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, struct machine *machine, bool mmap_data) { - char filename[PATH_MAX]; FILE *fp; unsigned long long t; + char bf[BUFSIZ]; bool truncation = false; unsigned long long timeout = proc_map_timeout * 1000000ULL; int rc = 0; @@ -293,15 +292,15 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, if (machine__is_default_guest(machine)) return 0; - snprintf(filename, sizeof(filename), "%s/proc/%d/task/%d/maps", - machine->root_dir, pid, pid); + snprintf(bf, sizeof(bf), "%s/proc/%d/task/%d/maps", + machine->root_dir, pid, pid); - fp = fopen(filename, "r"); + fp = fopen(bf, "r"); if (fp == NULL) { /* * We raced with a task exiting - just return: */ - pr_debug("couldn't open %s\n", filename); + pr_debug("couldn't open %s\n", bf); return -1; } @@ -309,7 +308,6 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, t = rdclock(); while (1) { - char bf[BUFSIZ]; char 
prot[5]; char execname[PATH_MAX]; char anonstr[] = "//anon"; @@ -321,10 +319,10 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, break; if ((rdclock() - t) > timeout) { - pr_warning("Reading %s time out. " + pr_warning("Reading %s/proc/%d/task/%d/maps time out. " "You may want to increase " "the time limit by --proc-map-timeout\n", - filename); + machine->root_dir, pid, pid); truncation = true; goto out; } From aecce63e2b98f28606b063949cca06facf215d6c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 2 Apr 2020 02:03:34 +0530 Subject: [PATCH 21/60] perf expr: Add expr_ prefix for parse_ctx and parse_id Adding expr_ prefix for parse_ctx and parse_id, to straighten out the expr* namespace. There's no functional change. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anju T Sudhakar Cc: Benjamin Herrenschmidt Cc: Greg Kroah-Hartman Cc: Jin Yao Cc: Joe Mario Cc: Kajol Jain Cc: Kan Liang Cc: Madhavan Srinivasan Cc: Mamatha Inamdar Cc: Mark Rutland Cc: Michael Ellerman Cc: Michael Petlan Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20200401203340.31402-2-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/expr.c | 4 ++-- tools/perf/util/expr.c | 10 +++++----- tools/perf/util/expr.h | 12 ++++++------ tools/perf/util/expr.y | 6 +++--- tools/perf/util/stat-shadow.c | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 28313e59d6f6..ea10fc4412c4 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -6,7 +6,7 @@ #include #include -static int test(struct parse_ctx *ctx, const char *e, double val2) +static int test(struct expr_parse_ctx *ctx, const char *e, double val2) { double val; @@ -22,7 +22,7 @@ int test__expr(struct test *t __maybe_unused, int subtest __maybe_unused) const char **other; double val; int i, ret; - struct parse_ctx ctx; + struct expr_parse_ctx ctx; int num_other; expr__ctx_init(&ctx); diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c index fd192ddf93c1..c8ccc548a585 100644 --- a/tools/perf/util/expr.c +++ b/tools/perf/util/expr.c @@ -11,7 +11,7 @@ extern int expr_debug; #endif /* Caller must make sure id is allocated */ -void expr__add_id(struct parse_ctx *ctx, const char *name, double val) +void expr__add_id(struct expr_parse_ctx *ctx, const char *name, double val) { int idx; @@ -21,13 +21,13 @@ void expr__add_id(struct parse_ctx *ctx, const char *name, double val) ctx->ids[idx].val = val; } -void expr__ctx_init(struct parse_ctx *ctx) +void expr__ctx_init(struct expr_parse_ctx *ctx) { ctx->num_ids = 0; } static int -__expr__parse(double *val, struct parse_ctx *ctx, const char *expr, +__expr__parse(double *val, struct expr_parse_ctx *ctx, const char *expr, int start) { YY_BUFFER_STATE buffer; @@ -52,7 +52,7 @@ __expr__parse(double *val, struct parse_ctx *ctx, const char *expr, return ret; } -int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr) +int expr__parse(double *final_val, struct expr_parse_ctx *ctx, const char *expr) { return __expr__parse(final_val, ctx, expr, EXPR_PARSE) ? 
-1 : 0; } @@ -75,7 +75,7 @@ int expr__find_other(const char *expr, const char *one, const char ***other, int *num_other) { int err, i = 0, j = 0; - struct parse_ctx ctx; + struct expr_parse_ctx ctx; expr__ctx_init(&ctx); err = __expr__parse(NULL, &ctx, expr, EXPR_OTHER); diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h index 9377538f4097..b9e53f2b5844 100644 --- a/tools/perf/util/expr.h +++ b/tools/perf/util/expr.h @@ -5,19 +5,19 @@ #define EXPR_MAX_OTHER 20 #define MAX_PARSE_ID EXPR_MAX_OTHER -struct parse_id { +struct expr_parse_id { const char *name; double val; }; -struct parse_ctx { +struct expr_parse_ctx { int num_ids; - struct parse_id ids[MAX_PARSE_ID]; + struct expr_parse_id ids[MAX_PARSE_ID]; }; -void expr__ctx_init(struct parse_ctx *ctx); -void expr__add_id(struct parse_ctx *ctx, const char *id, double val); -int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr); +void expr__ctx_init(struct expr_parse_ctx *ctx); +void expr__add_id(struct expr_parse_ctx *ctx, const char *id, double val); +int expr__parse(double *final_val, struct expr_parse_ctx *ctx, const char *expr); int expr__find_other(const char *expr, const char *one, const char ***other, int *num_other); diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y index 4720cbe79357..cd17486c1c5d 100644 --- a/tools/perf/util/expr.y +++ b/tools/perf/util/expr.y @@ -15,7 +15,7 @@ %define api.pure full %parse-param { double *final_val } -%parse-param { struct parse_ctx *ctx } +%parse-param { struct expr_parse_ctx *ctx } %parse-param {void *scanner} %lex-param {void* scanner} @@ -39,14 +39,14 @@ %{ static void expr_error(double *final_val __maybe_unused, - struct parse_ctx *ctx __maybe_unused, + struct expr_parse_ctx *ctx __maybe_unused, void *scanner, const char *s) { pr_debug("%s\n", s); } -static int lookup_id(struct parse_ctx *ctx, char *id, double *val) +static int lookup_id(struct expr_parse_ctx *ctx, char *id, double *val) { int i; diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 03ecb8cd0eec..1ad5c5be7e97 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -729,7 +729,7 @@ static void generic_metric(struct perf_stat_config *config, struct runtime_stat *st) { print_metric_t print_metric = out->print_metric; - struct parse_ctx pctx; + struct expr_parse_ctx pctx; double ratio, scale; int i; void *ctxp = out->ctx; From 871f9f599db8d9d2387c0717e712af405290edea Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 2 Apr 2020 02:03:35 +0530 Subject: [PATCH 22/60] perf expr: Add expr_scanner_ctx object Add the expr_scanner_ctx object to hold user data for the expr scanner. Currently it holds only start_token, Kajol Jain will use it to hold 24x7 runtime param. 
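The underlying mechanism is flex's reentrant-scanner "extra data" hook. A toy model in plain C (illustrative names only, not perf code) shows the shape of it: a context object travels alongside the scanner, and the start token is consumed exactly once:

    #include <stdio.h>

    struct scanner_ctx {		/* stands in for expr_scanner_ctx */
    	int start_token;
    };

    struct scanner {			/* stands in for the generated scanner */
    	void *extra;
    };

    static int next_token(struct scanner *s)
    {
    	struct scanner_ctx *sctx = s->extra;	/* like expr_get_extra() */

    	if (sctx->start_token) {
    		int tok = sctx->start_token;
    		sctx->start_token = 0;	/* hand out the start token once */
    		return tok;
    	}
    	return -1;	/* would normally run the scan rules */
    }

    int main(void)
    {
    	struct scanner_ctx sctx = { .start_token = 42 };
    	struct scanner s = { .extra = &sctx };

    	printf("%d %d\n", next_token(&s), next_token(&s));
    	return 0;
    }

Compiled standalone this prints "42 -1": the first call returns the stashed start token, later calls fall through to normal scanning.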
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anju T Sudhakar Cc: Benjamin Herrenschmidt Cc: Greg Kroah-Hartman Cc: Jin Yao Cc: Joe Mario Cc: Kajol Jain Cc: Kan Liang Cc: Madhavan Srinivasan Cc: Mamatha Inamdar Cc: Mark Rutland Cc: Michael Ellerman Cc: Michael Petlan Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20200401203340.31402-3-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo
--- tools/perf/util/expr.c | 6 ++++-- tools/perf/util/expr.h | 4 ++++ tools/perf/util/expr.l | 10 +++++----- 3 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c index c8ccc548a585..c3382d58cf40 100644 --- a/tools/perf/util/expr.c +++ b/tools/perf/util/expr.c @@ -3,7 +3,6 @@ #include #include "expr.h" #include "expr-bison.h" -#define YY_EXTRA_TYPE int #include "expr-flex.h" #ifdef PARSER_DEBUG @@ -30,11 +29,14 @@ static int __expr__parse(double *val, struct expr_parse_ctx *ctx, const char *expr, int start) { + struct expr_scanner_ctx scanner_ctx = { + .start_token = start, + }; YY_BUFFER_STATE buffer; void *scanner; int ret; - ret = expr_lex_init_extra(start, &scanner); + ret = expr_lex_init_extra(&scanner_ctx, &scanner); if (ret) return ret;
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h index b9e53f2b5844..0938ad166ece 100644 --- a/tools/perf/util/expr.h +++ b/tools/perf/util/expr.h @@ -15,6 +15,10 @@ struct expr_parse_ctx { struct expr_parse_id ids[MAX_PARSE_ID]; }; +struct expr_scanner_ctx { + int start_token; +}; + void expr__ctx_init(struct expr_parse_ctx *ctx); void expr__add_id(struct expr_parse_ctx *ctx, const char *id, double val); int expr__parse(double *final_val, struct expr_parse_ctx *ctx, const char *expr);
diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l index eaad29243c23..2582c2464938 100644 --- a/tools/perf/util/expr.l +++ b/tools/perf/util/expr.l @@ -76,13 +76,13 @@ sym [0-9a-zA-Z_\.:@]+ symbol {spec}*{sym}*{spec}*{sym}* %% + struct expr_scanner_ctx *sctx = expr_get_extra(yyscanner); + { - int start_token; + int start_token = sctx->start_token; - start_token = expr_get_extra(yyscanner); - - if (start_token) { - expr_set_extra(NULL, yyscanner); + if (sctx->start_token) { + sctx->start_token = 0; return start_token; } }
From 47352aba40035ab3fdc50dd03a94456feabed7d8 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Thu, 2 Apr 2020 02:03:36 +0530 Subject: [PATCH 23/60] perf metricgroup: Split the metricgroup__add_metric function

This patch refactors the metricgroup__add_metric() function, moving part of it into the new helper function __metricgroup__add_metric(). No logic change.
Signed-off-by: Kajol Jain Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anju T Sudhakar Cc: Benjamin Herrenschmidt Cc: Greg Kroah-Hartman Cc: Jin Yao Cc: Joe Mario Cc: Kan Liang Cc: Madhavan Srinivasan Cc: Mamatha Inamdar Cc: Mark Rutland Cc: Michael Ellerman Cc: Michael Petlan Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20200401203340.31402-4-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 60 ++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 926449a7cdbf..7ad81c8177ea 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -485,6 +485,39 @@ static bool metricgroup__has_constraint(struct pmu_event *pe) return false; } +static int __metricgroup__add_metric(struct strbuf *events, + struct list_head *group_list, struct pmu_event *pe) +{ + + const char **ids; + int idnum; + struct egroup *eg; + + if (expr__find_other(pe->metric_expr, NULL, &ids, &idnum) < 0) + return -EINVAL; + + if (events->len > 0) + strbuf_addf(events, ","); + + if (metricgroup__has_constraint(pe)) + metricgroup__add_metric_non_group(events, ids, idnum); + else + metricgroup__add_metric_weak_group(events, ids, idnum); + + eg = malloc(sizeof(*eg)); + if (!eg) + return -ENOMEM; + + eg->ids = ids; + eg->idnum = idnum; + eg->metric_name = pe->metric_name; + eg->metric_expr = pe->metric_expr; + eg->metric_unit = pe->unit; + list_add_tail(&eg->nd, group_list); + + return 0; +} + static int metricgroup__add_metric(const char *metric, struct strbuf *events, struct list_head *group_list) { @@ -504,35 +537,12 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events, continue; if (match_metric(pe->metric_group, metric) || match_metric(pe->metric_name, metric)) { - const char **ids; - int idnum; - struct egroup *eg; pr_debug("metric expr %s for %s\n", pe->metric_expr, pe->metric_name); - if (expr__find_other(pe->metric_expr, - NULL, &ids, &idnum) < 0) - continue; - if (events->len > 0) - strbuf_addf(events, ","); - - if (metricgroup__has_constraint(pe)) - metricgroup__add_metric_non_group(events, ids, idnum); - else - metricgroup__add_metric_weak_group(events, ids, idnum); - - eg = malloc(sizeof(struct egroup)); - if (!eg) { - ret = -ENOMEM; + ret = __metricgroup__add_metric(events, group_list, pe); + if (ret == -ENOMEM) break; - } - eg->ids = ids; - eg->idnum = idnum; - eg->metric_name = pe->metric_name; - eg->metric_expr = pe->metric_expr; - eg->metric_unit = pe->unit; - list_add_tail(&eg->nd, group_list); - ret = 0; } } return ret; From 5287f926920688e1151741d49da37a533ccf1960 Mon Sep 17 00:00:00 2001 From: Andreas Gerstmayr Date: Fri, 20 Mar 2020 16:13:48 +0100 Subject: [PATCH 24/60] perf script: Add flamegraph.py script This script works in tandem with d3-flame-graph to generate flame graphs from perf. It supports two output formats: JSON and HTML (the default). The HTML format will look for a standalone d3-flame-graph template file in /usr/share/d3-flame-graph/d3-flamegraph-base.html and fill in the collected stacks. 
Usage: perf record -a -g -F 99 sleep 60 perf script report flamegraph Combined: perf script flamegraph -a -F 99 sleep 60 Committer testing: Tested both with "PYTHON=python3" and with the default, that uses python2-devel: Complete set of instructions: $ mkdir /tmp/build/perf $ make PYTHON=python3 -C tools/perf O=/tmp/build/perf install-bin $ export PATH=~/bin:$PATH $ perf record -a -g -F 99 sleep 60 $ perf script report flamegraph Now go and open the generated flamegraph.html file in a browser. At first this required building with PYTHON=python3, but after I reported this Andreas was kind enough to send a patch making it work with both python and python3. Signed-off-by: Andreas Gerstmayr Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Brendan Gregg Cc: Martin Spier Link: http://lore.kernel.org/lkml/20200320151355.66302-1-agerstmayr@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/scripts/python/bin/flamegraph-record | 2 + .../perf/scripts/python/bin/flamegraph-report | 3 + tools/perf/scripts/python/flamegraph.py | 124 ++++++++++++++++++ 3 files changed, 129 insertions(+) create mode 100755 tools/perf/scripts/python/bin/flamegraph-record create mode 100755 tools/perf/scripts/python/bin/flamegraph-report create mode 100755 tools/perf/scripts/python/flamegraph.py diff --git a/tools/perf/scripts/python/bin/flamegraph-record b/tools/perf/scripts/python/bin/flamegraph-record new file mode 100755 index 000000000000..725d66e71570 --- /dev/null +++ b/tools/perf/scripts/python/bin/flamegraph-record @@ -0,0 +1,2 @@ +#!/usr/bin/sh +perf record -g "$@" diff --git a/tools/perf/scripts/python/bin/flamegraph-report b/tools/perf/scripts/python/bin/flamegraph-report new file mode 100755 index 000000000000..b1a79afd903b --- /dev/null +++ b/tools/perf/scripts/python/bin/flamegraph-report @@ -0,0 +1,3 @@ +#!/usr/bin/sh +# description: create flame graphs +perf script -s "$PERF_EXEC_PATH"/scripts/python/flamegraph.py -- "$@" diff --git a/tools/perf/scripts/python/flamegraph.py b/tools/perf/scripts/python/flamegraph.py new file mode 100755 index 000000000000..61f3be9add6b --- /dev/null +++ b/tools/perf/scripts/python/flamegraph.py @@ -0,0 +1,124 @@ +# flamegraph.py - create flame graphs from perf samples +# SPDX-License-Identifier: GPL-2.0 +# +# Usage: +# +# perf record -a -g -F 99 sleep 60 +# perf script report flamegraph +# +# Combined: +# +# perf script flamegraph -a -F 99 sleep 60 +# +# Written by Andreas Gerstmayr +# Flame Graphs invented by Brendan Gregg +# Works in tandem with d3-flame-graph by Martin Spier + +from __future__ import print_function +import sys +import os +import argparse +import json + + +class Node: + def __init__(self, name, libtype=""): + self.name = name + self.libtype = libtype + self.value = 0 + self.children = [] + + def toJSON(self): + return { + "n": self.name, + "l": self.libtype, + "v": self.value, + "c": self.children + } + + +class FlameGraphCLI: + def __init__(self, args): + self.args = args + self.stack = Node("root") + + if self.args.format == "html" and \ + not os.path.isfile(self.args.template): + print("Flame Graph template {} does not exist. 
Please install " "the js-d3-flame-graph (RPM) or libjs-d3-flame-graph (deb) " "package, specify an existing flame graph template " "(--template PATH) or another output format " "(--format FORMAT).".format(self.args.template), file=sys.stderr) sys.exit(1) + def find_or_create_node(self, node, name, dso): + libtype = "kernel" if dso == "[kernel.kallsyms]" else "" + if name is None: + name = "[unknown]" + + for child in node.children: + if child.name == name and child.libtype == libtype: + return child + + child = Node(name, libtype) + node.children.append(child) + return child + + def process_event(self, event): + node = self.find_or_create_node(self.stack, event["comm"], None) + if "callchain" in event: + for entry in reversed(event['callchain']): + node = self.find_or_create_node( + node, entry.get("sym", {}).get("name"), event.get("dso")) + else: + node = self.find_or_create_node( + node, event.get("symbol"), event.get("dso")) + node.value += 1 + + def trace_end(self): + json_str = json.dumps(self.stack, default=lambda x: x.toJSON()) + + if self.args.format == "html": + try: + with open(self.args.template) as f: + output_str = f.read().replace("/** @flamegraph_json **/", + json_str) + except IOError as e: + print("Error reading template file: {}".format(e), file=sys.stderr) + sys.exit(1) + output_fn = self.args.output or "flamegraph.html" + else: + output_str = json_str + output_fn = self.args.output or "stacks.json" + + if output_fn == "-": + sys.stdout.write(output_str) + else: + print("dumping data to {}".format(output_fn)) + try: + with open(output_fn, "w") as out: + out.write(output_str) + except IOError as e: + print("Error writing output file: {}".format(e), file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Create flame graphs.") + parser.add_argument("-f", "--format", + default="html", choices=["json", "html"], + help="output file format") + parser.add_argument("-o", "--output", + help="output file name") + parser.add_argument("--template", + default="/usr/share/d3-flame-graph/d3-flamegraph-base.html", + help="path to flamegraph HTML template") + parser.add_argument("-i", "--input", + help=argparse.SUPPRESS) + + args = parser.parse_args() + cli = FlameGraphCLI(args) + + process_event = cli.process_event + trace_end = cli.trace_end
From 853f37d75c44c305f750d8c4a34d83f03b610fce Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:15:58 +0300 Subject: [PATCH 25/60] perf auxtrace: Add ->evsel_is_auxtrace() callback

Add ->evsel_is_auxtrace() callback to identify if a selected event is an AUX area event.
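With the callback in place (per-PMU implementations follow in the next patches), generic session code can ask the question without knowing any PMU specifics. A hypothetical consumer built on the new API (the helper name below is made up for illustration) might look like:

    /* Hypothetical helper, for illustration only: report whether any
     * evsel in the session produces AUX area data. */
    static bool session_has_auxtrace_evsel(struct perf_session *session)
    {
    	struct evsel *evsel;

    	evlist__for_each_entry(session->evlist, evsel) {
    		if (auxtrace__evsel_is_auxtrace(session, evsel))
    			return true;
    	}
    	return false;
    }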
Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: Kim Phillips Cc: Mathieu Poirier Cc: Thomas Richter Link: http://lore.kernel.org/lkml/20200401101613.6201-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/auxtrace.c | 9 +++++++++ tools/perf/util/auxtrace.h | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 3571ce72ca28..2c4ad6838766 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -2577,3 +2577,12 @@ void auxtrace__free(struct perf_session *session) return session->auxtrace->free(session); } + +bool auxtrace__evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel) +{ + if (!session->auxtrace || !session->auxtrace->evsel_is_auxtrace) + return false; + + return session->auxtrace->evsel_is_auxtrace(session, evsel); +} diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index e58ef160b599..db65aae5c2ea 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -21,6 +21,7 @@ union perf_event; struct perf_session; struct evlist; +struct evsel; struct perf_tool; struct mmap; struct perf_sample; @@ -166,6 +167,8 @@ struct auxtrace { struct perf_tool *tool); void (*free_events)(struct perf_session *session); void (*free)(struct perf_session *session); + bool (*evsel_is_auxtrace)(struct perf_session *session, + struct evsel *evsel); }; /** @@ -584,6 +587,8 @@ void auxtrace__dump_auxtrace_sample(struct perf_session *session, int auxtrace__flush_events(struct perf_session *session, struct perf_tool *tool); void auxtrace__free_events(struct perf_session *session); void auxtrace__free(struct perf_session *session); +bool auxtrace__evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel); #define ITRACE_HELP \ " i: synthesize instructions events\n" \ @@ -749,6 +754,13 @@ void auxtrace_index__free(struct list_head *head __maybe_unused) { } +static inline +bool auxtrace__evsel_is_auxtrace(struct perf_session *session __maybe_unused, + struct evsel *evsel __maybe_unused) +{ + return false; +} + static inline int auxtrace_parse_filters(struct evlist *evlist __maybe_unused) { From 6b52bb07c397af274850deb9e4e054bdb6261e73 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:15:59 +0300 Subject: [PATCH 26/60] perf intel-pt: Implement ->evsel_is_auxtrace() callback Implement ->evsel_is_auxtrace() callback. 
Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 23c8289c2472..db25c77d82f3 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -2715,6 +2715,15 @@ static void intel_pt_free(struct perf_session *session) free(pt); } +static bool intel_pt_evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel) +{ + struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, + auxtrace); + + return evsel->core.attr.type == pt->pmu_type; +} + static int intel_pt_process_auxtrace_event(struct perf_session *session, union perf_event *event, struct perf_tool *tool __maybe_unused) @@ -3310,6 +3319,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event, pt->auxtrace.flush_events = intel_pt_flush; pt->auxtrace.free_events = intel_pt_free_events; pt->auxtrace.free = intel_pt_free; + pt->auxtrace.evsel_is_auxtrace = intel_pt_evsel_is_auxtrace; session->auxtrace = &pt->auxtrace; if (dump_trace) From 966246f597deafbcb1d8c126865b4efdc2be776e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:00 +0300 Subject: [PATCH 27/60] perf intel-bts: Implement ->evsel_is_auxtrace() callback Implement ->evsel_is_auxtrace() callback. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-bts.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 34cb380d19a3..059e1c805ed0 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -728,6 +728,15 @@ static void intel_bts_free(struct perf_session *session) free(bts); } +static bool intel_bts_evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel) +{ + struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, + auxtrace); + + return evsel->core.attr.type == bts->pmu_type; +} + struct intel_bts_synth { struct perf_tool dummy_tool; struct perf_session *session; @@ -883,6 +892,7 @@ int intel_bts_process_auxtrace_info(union perf_event *event, bts->auxtrace.flush_events = intel_bts_flush; bts->auxtrace.free_events = intel_bts_free_events; bts->auxtrace.free = intel_bts_free; + bts->auxtrace.evsel_is_auxtrace = intel_bts_evsel_is_auxtrace; session->auxtrace = &bts->auxtrace; intel_bts_print_info(&auxtrace_info->priv[0], INTEL_BTS_PMU_TYPE, From 508c71e3f90e4ceee7516d691355a36660a3e5bf Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:01 +0300 Subject: [PATCH 28/60] perf arm-spe: Implement ->evsel_is_auxtrace() callback Implement ->evsel_is_auxtrace() callback. 
Signed-off-by: Adrian Hunter Reviewed-by: Leo Yan Cc: Andi Kleen Cc: Jiri Olsa Cc: Kim Phillips Link: http://lore.kernel.org/lkml/20200401101613.6201-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/arm-spe.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index 53be12b23ff4..875a0dd540e5 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -176,6 +176,14 @@ static void arm_spe_free(struct perf_session *session) free(spe); } +static bool arm_spe_evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel) +{ + struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace); + + return evsel->core.attr.type == spe->pmu_type; +} + static const char * const arm_spe_info_fmts[] = { [ARM_SPE_PMU_TYPE] = " PMU Type %"PRId64"\n", }; @@ -218,6 +226,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event, spe->auxtrace.flush_events = arm_spe_flush; spe->auxtrace.free_events = arm_spe_free_events; spe->auxtrace.free = arm_spe_free; + spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace; session->auxtrace = &spe->auxtrace; arm_spe_print_info(&auxtrace_info->priv[0]); From a58ab57caad02b0d854969e191b5d1d4b0f90930 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:02 +0300 Subject: [PATCH 29/60] perf cs-etm: Implement ->evsel_is_auxtrace() callback Implement ->evsel_is_auxtrace() callback. Signed-off-by: Adrian Hunter Reviewed-by: Mathieu Poirier Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 62d2f9b9ce1b..3c802fde4954 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -631,6 +631,16 @@ static void cs_etm__free(struct perf_session *session) zfree(&aux); } +static bool cs_etm__evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel) +{ + struct cs_etm_auxtrace *aux = container_of(session->auxtrace, + struct cs_etm_auxtrace, + auxtrace); + + return evsel->core.attr.type == aux->pmu_type; +} + static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address) { struct machine *machine; @@ -2618,6 +2628,7 @@ int cs_etm__process_auxtrace_info(union perf_event *event, etm->auxtrace.flush_events = cs_etm__flush_events; etm->auxtrace.free_events = cs_etm__free_events; etm->auxtrace.free = cs_etm__free; + etm->auxtrace.evsel_is_auxtrace = cs_etm__evsel_is_auxtrace; session->auxtrace = &etm->auxtrace; etm->unknown_thread = thread__new(999999999, 999999999); From 113fcb46cfd557c549ab6bd9a1d43fda2c3a488c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:03 +0300 Subject: [PATCH 30/60] perf s390-cpumsf: Implement ->evsel_is_auxtrace() callback Implement ->evsel_is_auxtrace() callback. 
Signed-off-by: Adrian Hunter Acked-by: Thomas Richter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-7-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/s390-cpumcf-kernel.h | 1 + tools/perf/util/s390-cpumsf.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/tools/perf/util/s390-cpumcf-kernel.h b/tools/perf/util/s390-cpumcf-kernel.h index d4356030b504..f55ca07f3ca1 100644 --- a/tools/perf/util/s390-cpumcf-kernel.h +++ b/tools/perf/util/s390-cpumcf-kernel.h @@ -11,6 +11,7 @@ #define S390_CPUMCF_DIAG_DEF 0xfeef /* Counter diagnostic entry ID */ #define PERF_EVENT_CPUM_CF_DIAG 0xBC000 /* Event: Counter sets */ +#define PERF_EVENT_CPUM_SF_DIAG 0xBD000 /* Event: Combined-sampling */ struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ unsigned int def:16; /* 0-15 Data Entry Format */ diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index 6785cd87aa4d..d7779e48652f 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -1047,6 +1047,14 @@ static void s390_cpumsf_free(struct perf_session *session) free(sf); } +static bool +s390_cpumsf_evsel_is_auxtrace(struct perf_session *session __maybe_unused, + struct evsel *evsel) +{ + return evsel->core.attr.type == PERF_TYPE_RAW && + evsel->core.attr.config == PERF_EVENT_CPUM_SF_DIAG; +} + static int s390_cpumsf_get_type(const char *cpuid) { int ret, family = 0; @@ -1142,6 +1150,7 @@ int s390_cpumsf_process_auxtrace_info(union perf_event *event, sf->auxtrace.flush_events = s390_cpumsf_flush; sf->auxtrace.free_events = s390_cpumsf_free_events; sf->auxtrace.free = s390_cpumsf_free; + sf->auxtrace.evsel_is_auxtrace = s390_cpumsf_evsel_is_auxtrace; session->auxtrace = &sf->auxtrace; if (dump_trace) From 5c7bec0c9c543733fa96fe500e4c245be6f9fd30 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:04 +0300 Subject: [PATCH 31/60] perf auxtrace: For reporting purposes, un-group AUX area event An AUX area event must be the group leader when recording traces in sample mode, but that does not produce the expected results from 'perf report' because it expects the leader to provide samples. Rather than teach 'perf report' about AUX area sampling, un-group the AUX area event during processing, making the 2nd event the leader. Example: $ perf record -e '{intel_pt//u,branch-misses:u}' -c 1 uname Linux [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.080 MB perf.data ] Before: $ perf report Samples: 800 of events 'anon group { intel_pt//u, branch-misses:u }', Event count (approx.): 800 Children Self Command Shared Object Symbol 0.00% 47.50% 0.00% 47.50% uname libc-2.28.so [.] _dl_addr 0.00% 16.38% 0.00% 16.38% uname ld-2.28.so [.] __GI___tunables_init 0.00% 54.75% 0.00% 4.75% uname ld-2.28.so [.] dl_main 0.00% 3.12% 0.00% 3.12% uname ld-2.28.so [.] _dl_map_object_from_fd 0.00% 2.38% 0.00% 2.38% uname ld-2.28.so [.] strcmp 0.00% 2.25% 0.00% 2.25% uname ld-2.28.so [.] _dl_check_map_versions 0.00% 2.00% 0.00% 2.00% uname ld-2.28.so [.] _dl_important_hwcaps 0.00% 2.00% 0.00% 2.00% uname ld-2.28.so [.] _dl_map_object_deps 0.00% 51.50% 0.00% 1.50% uname ld-2.28.so [.] _dl_sysdep_start 0.00% 1.25% 0.00% 1.25% uname ld-2.28.so [.] _dl_load_cache_lookup 0.00% 51.12% 0.00% 1.12% uname ld-2.28.so [.] _dl_start 0.00% 50.88% 0.00% 1.12% uname ld-2.28.so [.] do_lookup_x 0.00% 50.62% 0.00% 1.00% uname ld-2.28.so [.] _dl_lookup_symbol_x 0.00% 1.00% 0.00% 1.00% uname ld-2.28.so [.] 
_dl_map_object 0.00% 1.00% 0.00% 1.00% uname ld-2.28.so [.] _dl_next_ld_env_entry 0.00% 0.88% 0.00% 0.88% uname ld-2.28.so [.] _dl_cache_libcmp 0.00% 0.88% 0.00% 0.88% uname ld-2.28.so [.] _dl_new_object 0.00% 50.88% 0.00% 0.88% uname ld-2.28.so [.] _dl_relocate_object 0.00% 0.62% 0.00% 0.62% uname ld-2.28.so [.] _dl_init_paths 0.00% 0.62% 0.00% 0.62% uname ld-2.28.so [.] _dl_name_match_p 0.00% 0.50% 0.00% 0.50% uname ld-2.28.so [.] get_common_indeces.constprop.1 0.00% 0.50% 0.00% 0.50% uname ld-2.28.so [.] memmove 0.00% 0.50% 0.00% 0.50% uname ld-2.28.so [.] memset 0.00% 0.50% 0.00% 0.50% uname ld-2.28.so [.] open_verify.constprop.11 0.00% 0.38% 0.00% 0.38% uname ld-2.28.so [.] _dl_check_all_versions 0.00% 0.38% 0.00% 0.38% uname ld-2.28.so [.] _dl_find_dso_for_object 0.00% 0.38% 0.00% 0.38% uname ld-2.28.so [.] init_tls 0.00% 0.25% 0.00% 0.25% uname ld-2.28.so [.] __tunable_get_val 0.00% 0.25% 0.00% 0.25% uname ld-2.28.so [.] _dl_add_to_namespace_list 0.00% 0.25% 0.00% 0.25% uname ld-2.28.so [.] _dl_determine_tlsoffset 0.00% 0.25% 0.00% 0.25% uname ld-2.28.so [.] _dl_discover_osversion 0.00% 0.25% 0.00% 0.25% uname ld-2.28.so [.] calloc@plt 0.00% 0.25% 0.00% 0.25% uname ld-2.28.so [.] malloc 0.00% 0.25% 0.00% 0.25% uname ld-2.28.so [.] malloc@plt 0.00% 0.25% 0.00% 0.25% uname libc-2.28.so [.] _nl_load_locale_from_archive 0.00% 0.25% 0.00% 0.25% uname [unknown] [k] 0xffffffffa3a00010 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] __libc_scratch_buffer_set_array_size 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] _dl_allocate_tls_storage 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] _dl_catch_exception 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] _dl_setup_hash 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] _dl_sort_maps 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] _dl_sysdep_read_whole_file 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] access 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] calloc 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] mmap64 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] openaux 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] rtld_lock_default_lock_recursive 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] rtld_lock_default_unlock_recursive 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] strchr 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] strlen 0.00% 0.12% 0.00% 0.12% uname ld-2.28.so [.] 0x0000000000001080 0.00% 0.12% 0.00% 0.12% uname libc-2.28.so [.] __strchrnul_avx2 0.00% 0.12% 0.00% 0.12% uname libc-2.28.so [.] _nl_normalize_codeset 0.00% 0.12% 0.00% 0.12% uname libc-2.28.so [.] malloc 0.00% 0.12% 0.00% 0.12% uname [unknown] [k] 0xffffffffa3a011f0 0.00% 50.00% 0.00% 0.00% uname ld-2.28.so [.] _dl_start_user 0.00% 50.00% 0.00% 0.00% uname [unknown] [.] 0000000000000000 After: Samples: 800 of event 'branch-misses:u', Event count (approx.): 800 Children Self Command Shared Object Symbol 54.75% 4.75% uname ld-2.28.so [.] dl_main 51.50% 1.50% uname ld-2.28.so [.] _dl_sysdep_start 51.12% 1.12% uname ld-2.28.so [.] _dl_start 50.88% 0.88% uname ld-2.28.so [.] _dl_relocate_object 50.88% 1.12% uname ld-2.28.so [.] do_lookup_x 50.62% 1.00% uname ld-2.28.so [.] _dl_lookup_symbol_x 50.00% 0.00% uname ld-2.28.so [.] _dl_start_user 50.00% 0.00% uname [unknown] [.] 0000000000000000 47.50% 47.50% uname libc-2.28.so [.] _dl_addr 16.38% 16.38% uname ld-2.28.so [.] __GI___tunables_init 3.12% 3.12% uname ld-2.28.so [.] _dl_map_object_from_fd 2.38% 2.38% uname ld-2.28.so [.] strcmp 2.25% 2.25% uname ld-2.28.so [.] _dl_check_map_versions 2.00% 2.00% uname ld-2.28.so [.] 
_dl_important_hwcaps 2.00% 2.00% uname ld-2.28.so [.] _dl_map_object_deps 1.25% 1.25% uname ld-2.28.so [.] _dl_load_cache_lookup 1.00% 1.00% uname ld-2.28.so [.] _dl_map_object 1.00% 1.00% uname ld-2.28.so [.] _dl_next_ld_env_entry 0.88% 0.88% uname ld-2.28.so [.] _dl_cache_libcmp 0.88% 0.88% uname ld-2.28.so [.] _dl_new_object 0.62% 0.62% uname ld-2.28.so [.] _dl_init_paths 0.62% 0.62% uname ld-2.28.so [.] _dl_name_match_p 0.50% 0.50% uname ld-2.28.so [.] get_common_indeces.constprop.1 0.50% 0.50% uname ld-2.28.so [.] memmove 0.50% 0.50% uname ld-2.28.so [.] memset 0.50% 0.50% uname ld-2.28.so [.] open_verify.constprop.11 0.38% 0.38% uname ld-2.28.so [.] _dl_check_all_versions 0.38% 0.38% uname ld-2.28.so [.] _dl_find_dso_for_object 0.38% 0.38% uname ld-2.28.so [.] init_tls 0.25% 0.25% uname ld-2.28.so [.] __tunable_get_val 0.25% 0.25% uname ld-2.28.so [.] _dl_add_to_namespace_list 0.25% 0.25% uname ld-2.28.so [.] _dl_determine_tlsoffset 0.25% 0.25% uname ld-2.28.so [.] _dl_discover_osversion 0.25% 0.25% uname ld-2.28.so [.] calloc@plt 0.25% 0.25% uname ld-2.28.so [.] malloc 0.25% 0.25% uname ld-2.28.so [.] malloc@plt 0.25% 0.25% uname libc-2.28.so [.] _nl_load_locale_from_archive 0.25% 0.25% uname [unknown] [k] 0xffffffffa3a00010 0.12% 0.12% uname ld-2.28.so [.] __libc_scratch_buffer_set_array_size 0.12% 0.12% uname ld-2.28.so [.] _dl_allocate_tls_storage 0.12% 0.12% uname ld-2.28.so [.] _dl_catch_exception 0.12% 0.12% uname ld-2.28.so [.] _dl_setup_hash 0.12% 0.12% uname ld-2.28.so [.] _dl_sort_maps 0.12% 0.12% uname ld-2.28.so [.] _dl_sysdep_read_whole_file 0.12% 0.12% uname ld-2.28.so [.] access 0.12% 0.12% uname ld-2.28.so [.] calloc 0.12% 0.12% uname ld-2.28.so [.] mmap64 0.12% 0.12% uname ld-2.28.so [.] openaux 0.12% 0.12% uname ld-2.28.so [.] rtld_lock_default_lock_recursive 0.12% 0.12% uname ld-2.28.so [.] rtld_lock_default_unlock_recursive 0.12% 0.12% uname ld-2.28.so [.] strchr 0.12% 0.12% uname ld-2.28.so [.] strlen 0.12% 0.12% uname ld-2.28.so [.] 0x0000000000001080 0.12% 0.12% uname libc-2.28.so [.] __strchrnul_avx2 0.12% 0.12% uname libc-2.28.so [.] _nl_normalize_codeset 0.12% 0.12% uname libc-2.28.so [.] 
malloc 0.12% 0.12% uname [unknown] [k] 0xffffffffa3a011f0 Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-8-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/auxtrace.c | 60 ++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 2c4ad6838766..b60bae8e395c 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1234,29 +1234,79 @@ out_free: return err; } +static void unleader_evsel(struct evlist *evlist, struct evsel *leader) +{ + struct evsel *new_leader = NULL; + struct evsel *evsel; + + /* Find new leader for the group */ + evlist__for_each_entry(evlist, evsel) { + if (evsel->leader != leader || evsel == leader) + continue; + if (!new_leader) + new_leader = evsel; + evsel->leader = new_leader; + } + + /* Update group information */ + if (new_leader) { + zfree(&new_leader->group_name); + new_leader->group_name = leader->group_name; + leader->group_name = NULL; + + new_leader->core.nr_members = leader->core.nr_members - 1; + leader->core.nr_members = 1; + } +} + +static void unleader_auxtrace(struct perf_session *session) +{ + struct evsel *evsel; + + evlist__for_each_entry(session->evlist, evsel) { + if (auxtrace__evsel_is_auxtrace(session, evsel) && + perf_evsel__is_group_leader(evsel)) { + unleader_evsel(session->evlist, evsel); + } + } +} + int perf_event__process_auxtrace_info(struct perf_session *session, union perf_event *event) { enum auxtrace_type type = event->auxtrace_info.type; + int err; if (dump_trace) fprintf(stdout, " type: %u\n", type); switch (type) { case PERF_AUXTRACE_INTEL_PT: - return intel_pt_process_auxtrace_info(event, session); + err = intel_pt_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_INTEL_BTS: - return intel_bts_process_auxtrace_info(event, session); + err = intel_bts_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_ARM_SPE: - return arm_spe_process_auxtrace_info(event, session); + err = arm_spe_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_CS_ETM: - return cs_etm__process_auxtrace_info(event, session); + err = cs_etm__process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_S390_CPUMSF: - return s390_cpumsf_process_auxtrace_info(event, session); + err = s390_cpumsf_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_UNKNOWN: default: return -EINVAL; } + + if (err) + return err; + + unleader_auxtrace(session); + + return 0; } s64 perf_event__process_auxtrace(struct perf_session *session, From 1c5c25b3fdbd7035f6d53a1a99b5afd577ce13e1 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:05 +0300 Subject: [PATCH 32/60] perf auxtrace: Add an option to synthesize callchains for regular events Currently, callchains can be synthesized only for synthesized events. Add an itrace option to synthesize callchains for regular events. 
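A usage sketch (the event names are illustrative; 'G' accepts the same optional stack depth suffix as 'g'):

	# Record an AUX area event sampling alongside a regular event:
	perf record --aux-sample -e '{intel_pt//,cycles}' -c 10000 uname

	# Decode, synthesizing call chains onto the existing cycles samples:
	perf report --itrace=Ge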
Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-9-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/itrace.txt | 1 + tools/perf/builtin-report.c | 3 ++- tools/perf/builtin-script.c | 2 +- tools/perf/util/auxtrace.c | 6 +++++- tools/perf/util/auxtrace.h | 2 ++ tools/perf/util/s390-cpumsf.c | 2 +- 6 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt index 82ff7dad40c2..671e154ede03 100644 --- a/tools/perf/Documentation/itrace.txt +++ b/tools/perf/Documentation/itrace.txt @@ -10,6 +10,7 @@ e synthesize error events d create a debug log g synthesize a call chain (use with i or x) + G synthesize a call chain on existing event records l synthesize last branch entries (use with i or x) s skip initial number of events diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 26d8fc27e427..c0cebd53ecf9 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -339,6 +339,7 @@ static int report__setup_sample_type(struct report *rep) bool is_pipe = perf_data__is_pipe(session->data); if (session->itrace_synth_opts->callchain || + session->itrace_synth_opts->add_callchain || (!is_pipe && perf_header__has_feat(&session->header, HEADER_AUXTRACE) && !session->itrace_synth_opts->set)) @@ -1332,7 +1333,7 @@ int cmd_report(int argc, const char **argv) if (symbol_conf.cumulate_callchain && !callchain_param.order_set) callchain_param.order = ORDER_CALLER; - if (itrace_synth_opts.callchain && + if ((itrace_synth_opts.callchain || itrace_synth_opts.add_callchain) && (int)itrace_synth_opts.callchain_sz > report.max_stack) report.max_stack = itrace_synth_opts.callchain_sz; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 8bf3ba280312..06b511c0a539 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3537,7 +3537,7 @@ int cmd_script(int argc, const char **argv) return -1; } - if (itrace_synth_opts.callchain && + if ((itrace_synth_opts.callchain || itrace_synth_opts.add_callchain) && itrace_synth_opts.callchain_sz > scripting_max_stack) scripting_max_stack = itrace_synth_opts.callchain_sz; diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index b60bae8e395c..809a09e75c55 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1462,8 +1462,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, synth_opts->branches = true; synth_opts->returns = true; break; + case 'G': case 'g': - synth_opts->callchain = true; + if (p[-1] == 'G') + synth_opts->add_callchain = true; + else + synth_opts->callchain = true; synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ; while (*p == ' ' || *p == ',') diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index db65aae5c2ea..dd8a4ff8209e 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -74,6 +74,7 @@ enum itrace_period_type { * @calls: limit branch samples to calls (can be combined with @returns) * @returns: limit branch samples to returns (can be combined with @calls) * @callchain: add callchain to 'instructions' events + * @add_callchain: add callchain to existing event records * @thread_stack: feed branches to the thread_stack * @last_branch: add branch context to 'instruction' events * @callchain_sz: maximum callchain size @@ -101,6 +102,7 @@ struct itrace_synth_opts { bool calls; bool 
returns; bool callchain; + bool add_callchain; bool thread_stack; bool last_branch; unsigned int callchain_sz; diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index d7779e48652f..38a942881d1a 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -1079,7 +1079,7 @@ static bool check_auxtrace_itrace(struct itrace_synth_opts *itops) itops->pwr_events || itops->errors || itops->dont_decode || itops->calls || itops->returns || itops->callchain || itops->thread_stack || - itops->last_branch; + itops->last_branch || itops->add_callchain; if (!ison) return true; pr_err("Unsupported --itrace options specified\n"); From 4fef41bfb1d8d2ada4a18eb3ab80c2682bcbae12 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:06 +0300 Subject: [PATCH 33/60] perf thread-stack: Add thread_stack__sample_late() Add a thread stack function to create a call chain for hardware events where the sample records get created some time after the event occurred. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-10-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread-stack.c | 57 ++++++++++++++++++++++++++++++++++ tools/perf/util/thread-stack.h | 3 ++ 2 files changed, 60 insertions(+) diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index 0885967d5bc3..83f6c83f5617 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -497,6 +497,63 @@ void thread_stack__sample(struct thread *thread, int cpu, chain->nr = i; } +/* + * Hardware sample records, created some time after the event occurred, need to + * have subsequent addresses removed from the call chain. + */ +void thread_stack__sample_late(struct thread *thread, int cpu, + struct ip_callchain *chain, size_t sz, + u64 sample_ip, u64 kernel_start) +{ + struct thread_stack *ts = thread__stack(thread, cpu); + u64 sample_context = callchain_context(sample_ip, kernel_start); + u64 last_context, context, ip; + size_t nr = 0, j; + + if (sz < 2) { + chain->nr = 0; + return; + } + + if (!ts) + goto out; + + /* + * When tracing kernel space, kernel addresses occur at the top of the + * call chain after the event occurred but before tracing stopped. + * Skip them. 
+ */ + for (j = 1; j <= ts->cnt; j++) { + ip = ts->stack[ts->cnt - j].ret_addr; + context = callchain_context(ip, kernel_start); + if (context == PERF_CONTEXT_USER || + (context == sample_context && ip == sample_ip)) + break; + } + + last_context = sample_ip; /* Use sample_ip as an invalid context */ + + for (; nr < sz && j <= ts->cnt; nr++, j++) { + ip = ts->stack[ts->cnt - j].ret_addr; + context = callchain_context(ip, kernel_start); + if (context != last_context) { + if (nr >= sz - 1) + break; + chain->ips[nr++] = context; + last_context = context; + } + chain->ips[nr] = ip; + } +out: + if (nr) { + chain->nr = nr; + } else { + chain->ips[0] = sample_context; + chain->ips[1] = sample_ip; + chain->nr = 2; + } +} + struct call_return_processor * call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data), void *data) diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h index e1ec5a58f1b2..8962ddc4e1ab 100644 --- a/tools/perf/util/thread-stack.h +++ b/tools/perf/util/thread-stack.h @@ -85,6 +85,9 @@ int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip, void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr); void thread_stack__sample(struct thread *thread, int cpu, struct ip_callchain *chain, size_t sz, u64 ip, u64 kernel_start); +void thread_stack__sample_late(struct thread *thread, int cpu, + struct ip_callchain *chain, size_t sz, u64 ip, + u64 kernel_start); int thread_stack__flush(struct thread *thread); void thread_stack__free(struct thread *thread); size_t thread_stack__depth(struct thread *thread, int cpu); From 8e94b3243a9af2c49a38fd0d6f2f9beb542e41a4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:07 +0300 Subject: [PATCH 34/60] perf evsel: Be consistent when looking which evsel PERF_SAMPLE_ bits are set Using 'type' variable for checking for callchains is equivalent to using evsel__has_callchain(evsel) and is how the other PERF_SAMPLE_ bits are checked in this function, so use it to be consistent. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-11-adrian.hunter@intel.com [ split from a larger patch ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d23db6755f51..f320adaf1326 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2136,7 +2136,7 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, } } - if (evsel__has_callchain(evsel)) { + if (type & PERF_SAMPLE_CALLCHAIN) { const u64 max_callchain_nr = UINT64_MAX / sizeof(u64); OVERFLOW_CHECK_u64(array); From e11869a065e36f3d22a575ccfb1097c262bb4f6e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:07 +0300 Subject: [PATCH 35/60] perf evsel: Add support for synthesized sample type For reporting purposes, an evsel sample can have a callchain synthesized from AUX area data. Add support for keeping track of synthesized sample types. Note, the recorded sample_type cannot be changed because it is needed to continue to parse events. 
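A decoder opting in would do something like this (sketch; it mirrors the Intel PT change later in this series):

	struct evsel *evsel;

	evlist__for_each_entry(session->evlist, evsel) {
		/* Flag the synthesized callchain; the recorded sample_type,
		 * still needed for parsing, is left untouched. */
		if (!(evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN))
			evsel->synth_sample_type |= PERF_SAMPLE_CALLCHAIN;
	}

After this, evsel__has_callchain() reports true for those evsels, so report-side code sets up callchain handling for them.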
Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-11-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 53187c501ee8..e64ed4202cab 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -104,6 +104,14 @@ struct evsel { perf_evsel__sb_cb_t *cb; void *data; } side_band; + /* + * For reporting purposes, an evsel sample can have a callchain + * synthesized from AUX area data. Keep track of synthesized sample + * types here. Note, the recorded sample_type cannot be changed because + * it is needed to continue to parse events. + * See also evsel__has_callchain(). + */ + __u64 synth_sample_type; }; struct perf_missing_features { @@ -398,7 +406,12 @@ static inline bool perf_evsel__has_branch_hw_idx(const struct evsel *evsel) static inline bool evsel__has_callchain(const struct evsel *evsel) { - return (evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0; + /* + * For reporting purposes, an evsel sample can have a recorded callchain + * or a callchain synthesized from AUX area data. + */ + return evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN || + evsel->synth_sample_type & PERF_SAMPLE_CALLCHAIN; } struct perf_env *perf_evsel__env(struct evsel *evsel); From 2855c05cf14a5ee0d3b58168632acb11ea35721f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:08 +0300 Subject: [PATCH 36/60] perf intel-pt: Add support for synthesizing callchains for regular events Currently, callchains can be synthesized only for synthesized events. Support also synthesizing callchains for regular events. 
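The per-sample hook is small; in effect (sketch, the full change appears in the diff further below):

	/* Attach a call chain built from the per-thread stack that branch
	 * decoding has been feeding via thread_stack__event(). */
	if (event->header.type == PERF_RECORD_SAMPLE &&
	    pt->synth_opts.add_callchain && !sample->callchain)
		intel_pt_add_callchain(pt, sample);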
Example: # perf record --kcore --aux-sample -e '{intel_pt//,cycles}' -c 10000 uname Linux [ perf record: Woken up 3 times to write data ] [ perf record: Captured and wrote 0.532 MB perf.data ] # perf script --itrace=Ge | head -20 uname 4864 2419025.358181: 10000 cycles: ffffffffbba56965 apparmor_bprm_committing_creds+0x35 ([kernel.kallsyms]) ffffffffbc400cd5 __indirect_thunk_start+0x5 ([kernel.kallsyms]) ffffffffbba07422 security_bprm_committing_creds+0x22 ([kernel.kallsyms]) ffffffffbb89805d install_exec_creds+0xd ([kernel.kallsyms]) ffffffffbb90d9ac load_elf_binary+0x3ac ([kernel.kallsyms]) uname 4864 2419025.358185: 10000 cycles: ffffffffbba56db0 apparmor_bprm_committed_creds+0x20 ([kernel.kallsyms]) ffffffffbc400cd5 __indirect_thunk_start+0x5 ([kernel.kallsyms]) ffffffffbba07452 security_bprm_committed_creds+0x22 ([kernel.kallsyms]) ffffffffbb89809a install_exec_creds+0x4a ([kernel.kallsyms]) ffffffffbb90d9ac load_elf_binary+0x3ac ([kernel.kallsyms]) uname 4864 2419025.358189: 10000 cycles: ffffffffbb86fdf6 vma_adjust_trans_huge+0x6 ([kernel.kallsyms]) ffffffffbb821660 __vma_adjust+0x160 ([kernel.kallsyms]) ffffffffbb897be7 shift_arg_pages+0x97 ([kernel.kallsyms]) ffffffffbb897ed9 setup_arg_pages+0x1e9 ([kernel.kallsyms]) ffffffffbb90d9f2 load_elf_binary+0x3f2 ([kernel.kallsyms]) Committer testing: # perf record --kcore --aux-sample -e '{intel_pt//,cycles}' -c 10000 uname Linux [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.233 MB perf.data ] # Then, before this patch: # perf script --itrace=Ge | head -20 uname 28642 168664.856384: 10000 cycles: ffffffff9810aeaa commit_creds+0x2a ([kernel.kallsyms]) uname 28642 168664.856388: 10000 cycles: ffffffff982a24f1 mprotect_fixup+0x151 ([kernel.kallsyms]) uname 28642 168664.856392: 10000 cycles: ffffffff982a385b move_page_tables+0xbcb ([kernel.kallsyms]) uname 28642 168664.856396: 10000 cycles: ffffffff982fd4ec __mod_memcg_state+0x1c ([kernel.kallsyms]) uname 28642 168664.856400: 10000 cycles: ffffffff9829fddd do_mmap+0xfd ([kernel.kallsyms]) uname 28642 168664.856404: 10000 cycles: ffffffff9829c879 __vma_adjust+0x479 ([kernel.kallsyms]) uname 28642 168664.856408: 10000 cycles: ffffffff98238e94 __perf_addr_filters_adjust+0x34 ([kernel.kallsyms]) uname 28642 168664.856412: 10000 cycles: ffffffff98a38e0b down_write+0x1b ([kernel.kallsyms]) uname 28642 168664.856416: 10000 cycles: ffffffff983006a0 memcg_kmem_get_cache+0x0 ([kernel.kallsyms]) uname 28642 168664.856421: 10000 cycles: ffffffff98396eaf load_elf_binary+0x92f ([kernel.kallsyms]) uname 28642 168664.856425: 10000 cycles: ffffffff982e0222 kfree+0x62 ([kernel.kallsyms]) uname 28642 168664.856428: 10000 cycles: ffffffff9846dfd4 file_has_perm+0x54 ([kernel.kallsyms]) uname 28642 168664.856433: 10000 cycles: ffffffff98288911 vma_interval_tree_insert+0x51 ([kernel.kallsyms]) uname 28642 168664.856437: 10000 cycles: ffffffff9823e577 perf_event_mmap_output+0x27 ([kernel.kallsyms]) uname 28642 168664.856441: 10000 cycles: ffffffff98a26fa0 xas_load+0x40 ([kernel.kallsyms]) uname 28642 168664.856445: 10000 cycles: ffffffff98004f30 arch_setup_additional_pages+0x0 ([kernel.kallsyms]) uname 28642 168664.856448: 10000 cycles: ffffffff98a297c0 copy_user_generic_unrolled+0xa0 ([kernel.kallsyms]) uname 28642 168664.856452: 10000 cycles: ffffffff9853a87a strnlen_user+0x10a ([kernel.kallsyms]) uname 28642 168664.856456: 10000 cycles: ffffffff986638a7 randomize_page+0x27 ([kernel.kallsyms]) uname 28642 168664.856460: 10000 cycles: ffffffff98a3b645 _raw_spin_lock+0x5 
([kernel.kallsyms]) # And after: # perf script --itrace=Ge | head -20 uname 28642 168664.856384: 10000 cycles: ffffffff9810aeaa commit_creds+0x2a ([kernel.kallsyms]) ffffffff9831fe87 install_exec_creds+0x17 ([kernel.kallsyms]) ffffffff983968d9 load_elf_binary+0x359 ([kernel.kallsyms]) ffffffff98e00c45 __x86_indirect_thunk_rax+0x5 ([kernel.kallsyms]) ffffffff98e00c45 __x86_indirect_thunk_rax+0x5 ([kernel.kallsyms]) uname 28642 168664.856388: 10000 cycles: ffffffff982a24f1 mprotect_fixup+0x151 ([kernel.kallsyms]) ffffffff9831fa83 setup_arg_pages+0x123 ([kernel.kallsyms]) ffffffff9839691f load_elf_binary+0x39f ([kernel.kallsyms]) ffffffff98e00c45 __x86_indirect_thunk_rax+0x5 ([kernel.kallsyms]) ffffffff98e00c45 __x86_indirect_thunk_rax+0x5 ([kernel.kallsyms]) uname 28642 168664.856392: 10000 cycles: ffffffff982a385b move_page_tables+0xbcb ([kernel.kallsyms]) ffffffff9831f889 shift_arg_pages+0xa9 ([kernel.kallsyms]) ffffffff9831fb4f setup_arg_pages+0x1ef ([kernel.kallsyms]) ffffffff9839691f load_elf_binary+0x39f ([kernel.kallsyms]) ffffffff98e00c45 __x86_indirect_thunk_rax+0x5 ([kernel.kallsyms]) # Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-12-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt.c | 68 ++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index db25c77d82f3..a659b4a1b3f2 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -124,6 +124,8 @@ struct intel_pt { struct range *time_ranges; unsigned int range_cnt; + + struct ip_callchain *chain; }; enum switch_state { @@ -868,6 +870,45 @@ static u64 intel_pt_ns_to_ticks(const struct intel_pt *pt, u64 ns) pt->tc.time_mult; } +static struct ip_callchain *intel_pt_alloc_chain(struct intel_pt *pt) +{ + size_t sz = sizeof(struct ip_callchain); + + /* Add 1 to callchain_sz for callchain context */ + sz += (pt->synth_opts.callchain_sz + 1) * sizeof(u64); + return zalloc(sz); +} + +static int intel_pt_callchain_init(struct intel_pt *pt) +{ + struct evsel *evsel; + + evlist__for_each_entry(pt->session->evlist, evsel) { + if (!(evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN)) + evsel->synth_sample_type |= PERF_SAMPLE_CALLCHAIN; + } + + pt->chain = intel_pt_alloc_chain(pt); + if (!pt->chain) + return -ENOMEM; + + return 0; +} + +static void intel_pt_add_callchain(struct intel_pt *pt, + struct perf_sample *sample) +{ + struct thread *thread = machine__findnew_thread(pt->machine, + sample->pid, + sample->tid); + + thread_stack__sample_late(thread, sample->cpu, pt->chain, + pt->synth_opts.callchain_sz + 1, sample->ip, + pt->kernel_start); + + sample->callchain = pt->chain; +} + static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt, unsigned int queue_nr) { @@ -880,11 +921,7 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt, return NULL; if (pt->synth_opts.callchain) { - size_t sz = sizeof(struct ip_callchain); - - /* Add 1 to callchain_sz for callchain context */ - sz += (pt->synth_opts.callchain_sz + 1) * sizeof(u64); - ptq->chain = zalloc(sz); + ptq->chain = intel_pt_alloc_chain(pt); if (!ptq->chain) goto out_free; } @@ -1992,7 +2029,8 @@ static int intel_pt_sample(struct intel_pt_queue *ptq) if (!(state->type & INTEL_PT_BRANCH)) return 0; - if (pt->synth_opts.callchain || pt->synth_opts.thread_stack) + if (pt->synth_opts.callchain 
|| pt->synth_opts.add_callchain || + pt->synth_opts.thread_stack) thread_stack__event(ptq->thread, ptq->cpu, ptq->flags, state->from_ip, state->to_ip, ptq->insn_len, state->trace_nr); @@ -2639,6 +2677,11 @@ static int intel_pt_process_event(struct perf_session *session, if (err) return err; + if (event->header.type == PERF_RECORD_SAMPLE) { + if (pt->synth_opts.add_callchain && !sample->callchain) + intel_pt_add_callchain(pt, sample); + } + if (event->header.type == PERF_RECORD_AUX && (event->aux.flags & PERF_AUX_FLAG_TRUNCATED) && pt->synth_opts.errors) { @@ -2710,6 +2753,7 @@ static void intel_pt_free(struct perf_session *session) session->auxtrace = NULL; thread__put(pt->unknown_thread); addr_filters__exit(&pt->filts); + zfree(&pt->chain); zfree(&pt->filter); zfree(&pt->time_ranges); free(pt); @@ -3348,6 +3392,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event, !session->itrace_synth_opts->inject) { pt->synth_opts.branches = false; pt->synth_opts.callchain = true; + pt->synth_opts.add_callchain = true; } pt->synth_opts.thread_stack = session->itrace_synth_opts->thread_stack; @@ -3380,14 +3425,22 @@ int intel_pt_process_auxtrace_info(union perf_event *event, pt->branches_filter |= PERF_IP_FLAG_RETURN | PERF_IP_FLAG_TRACE_BEGIN; - if (pt->synth_opts.callchain && !symbol_conf.use_callchain) { + if ((pt->synth_opts.callchain || pt->synth_opts.add_callchain) && + !symbol_conf.use_callchain) { symbol_conf.use_callchain = true; if (callchain_register_param(&callchain_param) < 0) { symbol_conf.use_callchain = false; pt->synth_opts.callchain = false; + pt->synth_opts.add_callchain = false; } } + if (pt->synth_opts.add_callchain) { + err = intel_pt_callchain_init(pt); + if (err) + goto err_delete_thread; + } + err = intel_pt_synth_events(pt, session); if (err) goto err_delete_thread; @@ -3410,6 +3463,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event, return 0; err_delete_thread: + zfree(&pt->chain); thread__zput(pt->unknown_thread); err_free_queues: intel_pt_log_disable(); From e12ee9f7513cb5dbe8b12aac030dfbeff35b3766 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:09 +0300 Subject: [PATCH 37/60] perf evsel: Move and globalize perf_evsel__find_pmu() and perf_evsel__is_aux_event() Move and globalize 2 functions from the auxtrace specific sources so that they can be reused. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-13-adrian.hunter@intel.com [ Move to pmu.c, as moving to evsel.h breaks the python binding ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/auxtrace.c | 19 ------------------- tools/perf/util/evsel.h | 3 +++ tools/perf/util/pmu.c | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 809a09e75c55..33ad33378a90 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -58,25 +58,6 @@ #include "symbol/kallsyms.h" #include -static struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel) -{ - struct perf_pmu *pmu = NULL; - - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - if (pmu->type == evsel->core.attr.type) - break; - } - - return pmu; -} - -static bool perf_evsel__is_aux_event(struct evsel *evsel) -{ - struct perf_pmu *pmu = perf_evsel__find_pmu(evsel); - - return pmu && pmu->auxtrace; -} - /* * Make a group from 'leader' to 'last', requiring that the events were not * already grouped to a different leader. 
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index e64ed4202cab..a463bc65b001 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -158,6 +158,9 @@ int perf_evsel__object_config(size_t object_size, int (*init)(struct evsel *evsel), void (*fini)(struct evsel *evsel)); +struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel); +bool perf_evsel__is_aux_event(struct evsel *evsel); + struct evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx); static inline struct evsel *evsel__new(struct perf_event_attr *attr) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index ef6a63f3d386..bc912a84b5e9 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -18,6 +18,7 @@ #include #include #include "debug.h" +#include "evsel.h" #include "pmu.h" #include "parse-events.h" #include "header.h" @@ -884,6 +885,25 @@ struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu) return NULL; } +struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel) +{ + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (pmu->type == evsel->core.attr.type) + break; + } + + return pmu; +} + +bool perf_evsel__is_aux_event(struct evsel *evsel) +{ + struct perf_pmu *pmu = perf_evsel__find_pmu(evsel); + + return pmu && pmu->auxtrace; +} + struct perf_pmu *perf_pmu__find(const char *name) { struct perf_pmu *pmu; From 5f34278867b78bed77dcbd723056244e9bfc12ef Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:10 +0300 Subject: [PATCH 38/60] perf evlist: Move leader-sampling configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move leader-sampling configuration in preparation for adding support for leader sampling with AUX area events. Committer notes: It only makes sense when configuring an evsel that is part of an evlist, so the only case where it is called outside perf_evlist__config(), in some 'perf test' entry, is safe, and even there we should just use perf_evlist__config(), but since in that case we have just one evsel in the evlist, it is equivalent. Also fixed up this problem: util/record.c: In function ‘perf_evlist__config’: util/record.c:223:3: error: too many arguments to function ‘perf_evsel__config_leader_sampling’ 223 | perf_evsel__config_leader_sampling(evsel, evlist); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ util/record.c:170:13: note: declared here 170 | static void perf_evsel__config_leader_sampling(struct evsel *evsel) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-14-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 19 ------------------- tools/perf/util/record.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f320adaf1326..8300e8c7aea8 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1002,25 +1002,6 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, } } - /* - * Disable sampling for all group members other - * than leader in case leader 'leads' the sampling. - */ - if ((leader != evsel) && leader->sample_read) { - attr->freq = 0; - attr->sample_freq = 0; - attr->sample_period = 0; - attr->write_backward = 0; - - /* - * We don't get sample for slave events, we make them - * when delivering group leader sample. 
Set the slave - * event to follow the master sample_type to ease up - * report. - */ - attr->sample_type = leader->core.attr.sample_type; - } - if (opts->no_samples) attr->sample_freq = 0; diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 7def66168503..8870ae451cac 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -167,6 +167,31 @@ bool perf_can_aux_sample(void) return true; } +static void perf_evsel__config_leader_sampling(struct evsel *evsel) +{ + struct perf_event_attr *attr = &evsel->core.attr; + struct evsel *leader = evsel->leader; + + /* + * Disable sampling for all group members other + * than leader in case leader 'leads' the sampling. + */ + if (leader != evsel && leader->sample_read) { + attr->freq = 0; + attr->sample_freq = 0; + attr->sample_period = 0; + attr->write_backward = 0; + + /* + * We don't get sample for slave events, we make them + * when delivering group leader sample. Set the slave + * event to follow the master sample_type to ease up + * report. + */ + attr->sample_type = leader->core.attr.sample_type; + } +} + void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, struct callchain_param *callchain) { @@ -193,6 +218,10 @@ void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, evsel->core.attr.comm_exec = 1; } + /* Configure leader sampling here now that the sample type is known */ + evlist__for_each_entry(evlist, evsel) + perf_evsel__config_leader_sampling(evsel); + if (opts->full_auxtrace) { /* * Need to be able to synthesize and parse selected events with From 3713eb371c873e6bed713d78f1bdd5e8be0764a3 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:11 +0300 Subject: [PATCH 39/60] perf evsel: Rearrange perf_evsel__config_leader_sampling() In preparation for adding support for leader sampling with AUX area events. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-15-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/record.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 8870ae451cac..32aeeb8a8d00 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -172,24 +172,24 @@ static void perf_evsel__config_leader_sampling(struct evsel *evsel) struct perf_event_attr *attr = &evsel->core.attr; struct evsel *leader = evsel->leader; + if (leader == evsel || !leader->sample_read) + return; + /* * Disable sampling for all group members other * than leader in case leader 'leads' the sampling. */ - if (leader != evsel && leader->sample_read) { - attr->freq = 0; - attr->sample_freq = 0; - attr->sample_period = 0; - attr->write_backward = 0; + attr->freq = 0; + attr->sample_freq = 0; + attr->sample_period = 0; + attr->write_backward = 0; - /* - * We don't get sample for slave events, we make them - * when delivering group leader sample. Set the slave - * event to follow the master sample_type to ease up - * report. - */ - attr->sample_type = leader->core.attr.sample_type; - } + /* + * We don't get a sample for slave events, we make them when delivering + * the group leader sample. Set the slave event to follow the master + * sample_type to ease up reporting. 
+ */ + attr->sample_type = leader->core.attr.sample_type; } void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, From 94d3820f2e18d3c88f833baec8d6ad5b3489fa59 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:12 +0300 Subject: [PATCH 40/60] perf evlist: Allow multiple read formats Tools find the correct evsel, and therefore read format, using the event ID, so it isn't necessary for all read formats to be the same. In the case of leader-sampling of AUX area events, dummy tracking events will have a different read format, so relax the validation to become a debug message only. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-16-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 1548237b6558..82d9f9bb8975 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1131,8 +1131,10 @@ bool perf_evlist__valid_read_format(struct evlist *evlist) u64 sample_type = first->core.attr.sample_type; evlist__for_each_entry(evlist, pos) { - if (read_format != pos->core.attr.read_format) - return false; + if (read_format != pos->core.attr.read_format) { + pr_debug("Read format differs %#" PRIx64 " vs %#" PRIx64 "\n", + read_format, (u64)pos->core.attr.read_format); + } } /* PERF_SAMPLE_READ imples PERF_FORMAT_ID. */ From e345997914a8af5e8362e884d2fee38bd2e9c6d8 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:13 +0300 Subject: [PATCH 41/60] perf tools: Add support for leader-sampling with AUX area events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When AUX area events are used in sampling mode, they must be the group leader, but the group leader is also used for leader-sampling. However, it is not desirable to use an AUX area event as the leader for leader-sampling, because it doesn't have any samples of its own. To support leader-sampling with AUX area events, use the 2nd event of the group as the "leader" for the purposes of leader-sampling. Example: # perf record --kcore --aux-sample -e '{intel_pt//,cycles,instructions}:S' -c 10000 uname [ perf record: Woken up 3 times to write data ] [ perf record: Captured and wrote 0.786 MB perf.data ] # perf report Samples: 380 of events 'anon group { cycles, instructions }', Event count (approx.): 3026164 Children Self Command Shared Object Symbol + 38.76% 42.65% 0.00% 0.00% uname [kernel.kallsyms] [k] __x86_indirect_thunk_rax + 35.82% 31.33% 0.00% 0.00% uname ld-2.28.so [.] _dl_start_user + 34.29% 29.74% 0.55% 0.47% uname ld-2.28.so [.] _dl_start + 33.73% 28.62% 1.60% 0.97% uname ld-2.28.so [.] dl_main + 33.19% 29.04% 0.52% 0.32% uname ld-2.28.so [.] _dl_sysdep_start + 27.83% 33.74% 0.00% 0.00% uname [kernel.kallsyms] [k] do_syscall_64 + 26.76% 33.29% 0.00% 0.00% uname [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe + 23.78% 20.33% 5.97% 5.25% uname [kernel.kallsyms] [k] page_fault + 23.18% 24.60% 0.00% 0.00% uname libc-2.28.so [.] __libc_start_main + 22.64% 24.37% 0.00% 0.00% uname uname [.] _start + 21.04% 23.27% 0.00% 0.00% uname uname [.] main + 19.48% 18.08% 3.72% 3.64% uname ld-2.28.so [.] _dl_relocate_object + 19.47% 21.81% 0.00% 0.00% uname libc-2.28.so [.] setlocale + 19.44% 21.56% 0.52% 0.61% uname libc-2.28.so [.] _nl_find_locale + 17.87% 19.66% 0.00% 0.00% uname libc-2.28.so [.] 
_nl_load_locale_from_archive + 15.71% 13.73% 0.53% 0.52% uname [kernel.kallsyms] [k] do_page_fault + 15.18% 13.21% 1.03% 0.68% uname [kernel.kallsyms] [k] handle_mm_fault + 14.15% 12.53% 1.01% 1.12% uname [kernel.kallsyms] [k] __handle_mm_fault + 12.03% 9.67% 0.54% 0.32% uname ld-2.28.so [.] _dl_map_object + 10.55% 8.48% 0.00% 0.00% uname ld-2.28.so [.] openaux + 10.55% 20.20% 0.52% 0.61% uname libc-2.28.so [.] __run_exit_handlers Committer notes: Fixed up this problem: util/record.c: In function ‘perf_evlist__config’: util/record.c:256:3: error: too few arguments to function ‘perf_evsel__config_leader_sampling’ 256 | perf_evsel__config_leader_sampling(evsel); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ util/record.c:190:13: note: declared here 190 | static void perf_evsel__config_leader_sampling(struct evsel *evsel, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-17-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 3 ++ tools/perf/util/record.c | 45 ++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 6345db33c533..cb23667531ab 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -258,6 +258,9 @@ Normally all events in an event group sample, but with :S only the first event (the leader) samples, and it only reads the values of the other events in the group. +However, in the case of AUX area events (e.g. Intel PT or CoreSight), the AUX +area event must be the leader, so then the second event samples, not the first. + OPTIONS ------- diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 32aeeb8a8d00..6d3e3df6e2a1 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -167,17 +167,46 @@ bool perf_can_aux_sample(void) return true; } -static void perf_evsel__config_leader_sampling(struct evsel *evsel) +/* + * perf_evsel__config_leader_sampling() uses special rules for leader sampling. + * However, if the leader is an AUX area event, then assume the event to sample + * is the next event. + */ +static struct evsel *perf_evsel__read_sampler(struct evsel *evsel, + struct evlist *evlist) +{ + struct evsel *leader = evsel->leader; + + if (perf_evsel__is_aux_event(leader)) { + evlist__for_each_entry(evlist, evsel) { + if (evsel->leader == leader && evsel != evsel->leader) + return evsel; + } + } + + return leader; +} + +static void perf_evsel__config_leader_sampling(struct evsel *evsel, + struct evlist *evlist) { struct perf_event_attr *attr = &evsel->core.attr; struct evsel *leader = evsel->leader; + struct evsel *read_sampler; - if (leader == evsel || !leader->sample_read) + if (!leader->sample_read) + return; + + read_sampler = perf_evsel__read_sampler(evsel, evlist); + + if (evsel == read_sampler) + return; /* - * Disable sampling for all group members other - * than leader in case leader 'leads' the sampling. + * Disable sampling for all group members other than the leader in + * case the leader 'leads' the sampling, except when the leader is an + * AUX area event, in which case the 2nd event in the group is the one + * that 'leads' the sampling.
*/ attr->freq = 0; attr->sample_freq = 0; @@ -188,8 +217,12 @@ static void perf_evsel__config_leader_sampling(struct evsel *evsel) * We don't get a sample for slave events, we make them when delivering * the group leader sample. Set the slave event to follow the master * sample_type to ease up reporting. + * An AUX area event also has sample_type requirements, so also include + * the sample type bits from the leader's sample_type to cover that + * case. */ - attr->sample_type = leader->core.attr.sample_type; + attr->sample_type = read_sampler->core.attr.sample_type | + leader->core.attr.sample_type; } void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, @@ -220,7 +253,7 @@ void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, /* Configure leader sampling here now that the sample type is known */ evlist__for_each_entry(evlist, evsel) - perf_evsel__config_leader_sampling(evsel); + perf_evsel__config_leader_sampling(evsel, evlist); if (opts->full_auxtrace) { /* From bec49a9e05db3dbdca696fa07c62c52638fb6371 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 14 Apr 2020 09:15:50 -0700 Subject: [PATCH 42/60] perf stat: Force error in fallback on :k events When it is not possible for a non-privileged perf command to monitor at the kernel level (:k), the fallback code forces a :u. That works if the event was previously monitoring both levels. But if the event was already constrained to kernel only, then it does not make sense to restrict it to user only. Given the code works by exclusion, a kernel-only event would have: attr->exclude_user = 1 The fallback code would add: attr->exclude_kernel = 1 In the end the event would not monitor in either the user level or kernel level. In other words, it would count nothing. An event programmed to monitor kernel only cannot be switched to user only without seriously warning the user. This patch forces an error in this case to make it clear the request cannot really be satisfied. Behavior with paranoid 1: $ sudo bash -c "echo 1 > /proc/sys/kernel/perf_event_paranoid" $ perf stat -e cycles:k sleep 1 Performance counter stats for 'sleep 1': 1,520,413 cycles:k 1.002361664 seconds time elapsed 0.002480000 seconds user 0.000000000 seconds sys Old behavior with paranoid 2: $ sudo bash -c "echo 2 > /proc/sys/kernel/perf_event_paranoid" $ perf stat -e cycles:k sleep 1 Performance counter stats for 'sleep 1': 0 cycles:ku 1.002358127 seconds time elapsed 0.002384000 seconds user 0.000000000 seconds sys New behavior with paranoid 2: $ sudo bash -c "echo 2 > /proc/sys/kernel/perf_event_paranoid" $ perf stat -e cycles:k sleep 1 Error: You may not have permission to collect stats. Consider tweaking /proc/sys/kernel/perf_event_paranoid, which controls use of the performance events system by unprivileged users (without CAP_PERFMON or CAP_SYS_ADMIN). The current value is 2: -1: Allow use of (almost) all events by all users Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK >= 0: Disallow ftrace function tracepoint by users without CAP_PERFMON or CAP_SYS_ADMIN Disallow raw tracepoint access by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN >= 1: Disallow CPU event access by users without CAP_PERFMON or CAP_SYS_ADMIN >= 2: Disallow kernel profiling by users without CAP_PERFMON or CAP_SYS_ADMIN To make this setting permanent, edit /etc/sysctl.conf too, e.g.: kernel.perf_event_paranoid = -1 v2 of this patch addresses the review feedback from jolsa@redhat.com.
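The fix reduces to an early bail-out at the top of perf_evsel__fallback(), before the fallback rewrites the modifiers; in effect (sketch, the actual diff follows):

	/* A kernel-only event (exclude_user set) must not be degraded to
	 * user-only; returning false lets perf stat print the permission
	 * error instead of silently counting nothing. */
	if (evsel->core.attr.exclude_user)
		return false;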
Signed-off-by: Stephane Eranian Reviewed-by: Ian Rogers Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200414161550.225588-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 8300e8c7aea8..6a571d322bb2 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2427,6 +2427,10 @@ bool perf_evsel__fallback(struct evsel *evsel, int err, char *new_name; const char *sep = ":"; + /* If event has exclude user then don't exclude kernel. */ + if (evsel->core.attr.exclude_user) + return false; + /* Is there already the separator in the name. */ if (strchr(name, '/') || strchr(name, ':')) From f8ff18be1f5c6ba1c2befb043bea6e7eaf9f8987 Mon Sep 17 00:00:00 2001 From: He Zhe Date: Thu, 20 Feb 2020 09:58:50 +0800 Subject: [PATCH 43/60] tools lib traceevent: Take care of return value of asprintf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the API, if memory allocation wasn't possible, or some other error occurs, asprintf will return -1, and the contents of strp below are undefined. int asprintf(char **strp, const char *fmt, ...); This patch takes care of return value of asprintf to make it less error prone and prevent the following build warning. ignoring return value of ‘asprintf’, declared with attribute warn_unused_result [-Wunused-result] Signed-off-by: He Zhe Reviewed-by: Steven Rostedt (VMware) Cc: Tzvetomir Stoyanov Cc: hewenliang4@huawei.com Link: http://lore.kernel.org/lkml/1582163930-233692-1-git-send-email-zhe.he@windriver.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/parse-filter.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 20eed719542e..c271aeeb227d 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -1958,7 +1958,8 @@ static char *op_to_str(struct tep_event_filter *filter, struct tep_filter_arg *a default: break; } - asprintf(&str, val ? "TRUE" : "FALSE"); + if (asprintf(&str, val ? "TRUE" : "FALSE") < 0) + str = NULL; break; } } @@ -1976,7 +1977,8 @@ static char *op_to_str(struct tep_event_filter *filter, struct tep_filter_arg *a break; } - asprintf(&str, "(%s) %s (%s)", left, op, right); + if (asprintf(&str, "(%s) %s (%s)", left, op, right) < 0) + str = NULL; break; case TEP_FILTER_OP_NOT: @@ -1992,10 +1994,12 @@ static char *op_to_str(struct tep_event_filter *filter, struct tep_filter_arg *a right_val = 0; if (right_val >= 0) { /* just return the opposite */ - asprintf(&str, right_val ? "FALSE" : "TRUE"); + if (asprintf(&str, right_val ? 
"FALSE" : "TRUE") < 0) + str = NULL; break; } - asprintf(&str, "%s(%s)", op, right); + if (asprintf(&str, "%s(%s)", op, right) < 0) + str = NULL; break; default: @@ -2011,7 +2015,8 @@ static char *val_to_str(struct tep_event_filter *filter, struct tep_filter_arg * { char *str = NULL; - asprintf(&str, "%lld", arg->value.val); + if (asprintf(&str, "%lld", arg->value.val) < 0) + str = NULL; return str; } @@ -2069,7 +2074,8 @@ static char *exp_to_str(struct tep_event_filter *filter, struct tep_filter_arg * break; } - asprintf(&str, "%s %s %s", lstr, op, rstr); + if (asprintf(&str, "%s %s %s", lstr, op, rstr) < 0) + str = NULL; out: free(lstr); free(rstr); @@ -2113,7 +2119,8 @@ static char *num_to_str(struct tep_event_filter *filter, struct tep_filter_arg * if (!op) op = "<="; - asprintf(&str, "%s %s %s", lstr, op, rstr); + if (asprintf(&str, "%s %s %s", lstr, op, rstr) < 0) + str = NULL; break; default: @@ -2148,8 +2155,9 @@ static char *str_to_str(struct tep_event_filter *filter, struct tep_filter_arg * if (!op) op = "!~"; - asprintf(&str, "%s %s \"%s\"", - arg->str.field->name, op, arg->str.val); + if (asprintf(&str, "%s %s \"%s\"", + arg->str.field->name, op, arg->str.val) < 0) + str = NULL; break; default: @@ -2165,7 +2173,8 @@ static char *arg_to_str(struct tep_event_filter *filter, struct tep_filter_arg * switch (arg->type) { case TEP_FILTER_ARG_BOOLEAN: - asprintf(&str, arg->boolean.value ? "TRUE" : "FALSE"); + if (asprintf(&str, arg->boolean.value ? "TRUE" : "FALSE") < 0) + str = NULL; return str; case TEP_FILTER_ARG_OP: From 9fbc61f832ebf432326a90e28184dade05ee34a8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:01 -0700 Subject: [PATCH 44/60] perf pmu: Add support for PMU capabilities The PMU capabilities information, which is located at /sys/bus/event_source/devices//caps, is required by perf tool. For example, the max LBR information is required to stitch LBR call stack. Add perf_pmu__caps_parse() to parse the PMU capabilities information. The information is stored in a list. The following patch will store the capabilities information in perf header. Committer notes: Here's an example of such directories and its files in an i5 7th gen machine: [root@seventh ~]# ls -lad /sys/bus/event_source/devices/*/caps drwxr-xr-x. 2 root root 0 Apr 14 13:33 /sys/bus/event_source/devices/cpu/caps drwxr-xr-x. 2 root root 0 Apr 14 13:33 /sys/bus/event_source/devices/intel_pt/caps [root@seventh ~]# ls -la /sys/bus/event_source/devices/intel_pt/caps total 0 drwxr-xr-x. 2 root root 0 Apr 14 13:33 . drwxr-xr-x. 5 root root 0 Apr 14 13:12 .. -r--r--r--. 1 root root 4096 Apr 16 13:10 cr3_filtering -r--r--r--. 1 root root 4096 Apr 16 11:42 cycle_thresholds -r--r--r--. 1 root root 4096 Apr 16 13:10 ip_filtering -r--r--r--. 1 root root 4096 Apr 16 13:10 max_subleaf -r--r--r--. 1 root root 4096 Apr 14 13:33 mtc -r--r--r--. 1 root root 4096 Apr 14 13:33 mtc_periods -r--r--r--. 1 root root 4096 Apr 16 13:10 num_address_ranges -r--r--r--. 1 root root 4096 Apr 16 13:10 output_subsys -r--r--r--. 1 root root 4096 Apr 16 13:10 payloads_lip -r--r--r--. 1 root root 4096 Apr 16 13:10 power_event_trace -r--r--r--. 1 root root 4096 Apr 14 13:33 psb_cyc -r--r--r--. 1 root root 4096 Apr 14 13:33 psb_periods -r--r--r--. 1 root root 4096 Apr 16 13:10 ptwrite -r--r--r--. 1 root root 4096 Apr 16 13:10 single_range_output -r--r--r--. 1 root root 4096 Apr 16 12:03 topa_multiple_entries -r--r--r--. 
1 root root 4096 Apr 16 13:10 topa_output [root@seventh ~]# cat /sys/bus/event_source/devices/intel_pt/caps/topa_output 1 [root@seventh ~]# cat /sys/bus/event_source/devices/intel_pt/caps/topa_multiple_entries 1 [root@seventh ~]# cat /sys/bus/event_source/devices/intel_pt/caps/mtc 1 [root@seventh ~]# cat /sys/bus/event_source/devices/intel_pt/caps/power_event_trace 0 [root@seventh ~]# [root@seventh ~]# ls -la /sys/bus/event_source/devices/cpu/caps/ total 0 drwxr-xr-x. 2 root root 0 Apr 14 13:33 . drwxr-xr-x. 6 root root 0 Apr 14 13:12 .. -r--r--r--. 1 root root 4096 Apr 16 13:10 branches -r--r--r--. 1 root root 4096 Apr 14 13:33 max_precise -r--r--r--. 1 root root 4096 Apr 16 13:10 pmu_name [root@seventh ~]# cat /sys/bus/event_source/devices/cpu/caps/max_precise 3 [root@seventh ~]# cat /sys/bus/event_source/devices/cpu/caps/branches 32 [root@seventh ~]# cat /sys/bus/event_source/devices/cpu/caps/pmu_name skylake [root@seventh ~]# Wow, first time I've heard about /sys/bus/event_source/devices/cpu/caps/max_precise, I think I'll use it! :-) Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 82 +++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/pmu.h | 9 +++++ 2 files changed, 91 insertions(+) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index bc912a84b5e9..d9f89ed18dea 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -850,6 +850,7 @@ static struct perf_pmu *pmu_lookup(const char *name) INIT_LIST_HEAD(&pmu->format); INIT_LIST_HEAD(&pmu->aliases); + INIT_LIST_HEAD(&pmu->caps); list_splice(&format, &pmu->format); list_splice(&aliases, &pmu->aliases); list_add_tail(&pmu->list, &pmus); @@ -1594,3 +1595,84 @@ int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, va_end(args); return ret; } + +static int perf_pmu__new_caps(struct list_head *list, char *name, char *value) +{ + struct perf_pmu_caps *caps = zalloc(sizeof(*caps)); + + if (!caps) + return -ENOMEM; + + caps->name = strdup(name); + if (!caps->name) + goto free_caps; + caps->value = strndup(value, strlen(value) - 1); + if (!caps->value) + goto free_name; + list_add_tail(&caps->list, list); + return 0; + +free_name: + zfree(&caps->name); +free_caps: + free(caps); + + return -ENOMEM; +} + +/* + * Reading/parsing the given pmu capabilities, which should be located at: + * /sys/bus/event_source/devices/<pmu>/caps as sysfs group attributes.
+ * Return the number of capabilities + */ +int perf_pmu__caps_parse(struct perf_pmu *pmu) +{ + struct stat st; + char caps_path[PATH_MAX]; + const char *sysfs = sysfs__mountpoint(); + DIR *caps_dir; + struct dirent *evt_ent; + int nr_caps = 0; + + if (!sysfs) + return -1; + + snprintf(caps_path, PATH_MAX, + "%s" EVENT_SOURCE_DEVICE_PATH "%s/caps", sysfs, pmu->name); + + if (stat(caps_path, &st) < 0) + return 0; /* no error if caps does not exist */ + + caps_dir = opendir(caps_path); + if (!caps_dir) + return -EINVAL; + + while ((evt_ent = readdir(caps_dir)) != NULL) { + char path[PATH_MAX + NAME_MAX + 1]; + char *name = evt_ent->d_name; + char value[128]; + FILE *file; + + if (!strcmp(name, ".") || !strcmp(name, "..")) + continue; + + snprintf(path, sizeof(path), "%s/%s", caps_path, name); + + file = fopen(path, "r"); + if (!file) + continue; + + if (!fgets(value, sizeof(value), file) || + (perf_pmu__new_caps(&pmu->caps, name, value) < 0)) { + fclose(file); + continue; + } + + nr_caps++; + fclose(file); + } + + closedir(caps_dir); + + return nr_caps; +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 5fb3f16828df..1edd214b75a5 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -22,6 +22,12 @@ enum { struct perf_event_attr; +struct perf_pmu_caps { + char *name; + char *value; + struct list_head list; +}; + struct perf_pmu { char *name; __u32 type; @@ -33,6 +39,7 @@ struct perf_pmu { struct perf_cpu_map *cpus; struct list_head format; /* HEAD struct perf_pmu_format -> list */ struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */ + struct list_head caps; /* HEAD struct perf_pmu_caps -> list */ struct list_head list; /* ELEM */ }; @@ -107,4 +114,6 @@ bool pmu_uncore_alias_match(const char *pmu_name, const char *name); int perf_pmu__convert_scale(const char *scale, char **end, double *sval); +int perf_pmu__caps_parse(struct perf_pmu *pmu); + #endif /* __PMU_H */ From e9cfa47e687d77d256610b7124d736776f137ea0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 16 Apr 2020 09:20:55 -0700 Subject: [PATCH 45/60] perf doc: allow ASCIIDOC_EXTRA to be an argument This will allow parent makefiles to pass values to asciidoc. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Alexey Budankov Cc: Andi Kleen Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Florian Fainelli Cc: Greg Kroah-Hartman Cc: Igor Lubashev Cc: Jin Yao Cc: Jiri Olsa Cc: Jiwei Sun Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Yonghong Song Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Cc: yuzhoujian Link: http://lore.kernel.org/lkml/20200416162058.201954-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index 31824d5269cc..6e54979c2124 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -48,7 +48,7 @@ man5dir=$(mandir)/man5 man7dir=$(mandir)/man7 ASCIIDOC=asciidoc -ASCIIDOC_EXTRA = --unsafe -f asciidoc.conf +ASCIIDOC_EXTRA += --unsafe -f asciidoc.conf ASCIIDOC_HTML = xhtml11 MANPAGE_XSL = manpage-normal.xsl XMLTO_EXTRA = @@ -59,7 +59,7 @@ HTML_REF = origin/html ifdef USE_ASCIIDOCTOR ASCIIDOC = asciidoctor -ASCIIDOC_EXTRA = -a compat-mode +ASCIIDOC_EXTRA += -a compat-mode ASCIIDOC_EXTRA += -I. 
-rasciidoctor-extensions ASCIIDOC_EXTRA += -a mansource="perf" -a manmanual="perf Manual" ASCIIDOC_HTML = xhtml5 From 3a6c51e4d66cf2fbc05583247b2d2f1179e8a74c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 17 Apr 2020 00:14:05 +0200 Subject: [PATCH 46/60] perf parser: Add support to specify rXXX event with pmu The current rXXXX event specification creates event under PERF_TYPE_RAW pmu type. This change allows to use rXXXX within pmu syntax, so it's type is used via the following syntax: -e 'cpu/r3c/' -e 'cpum_cf/r0/' The XXXX number goes directly to perf_event_attr::config the same way as in '-e rXXXX' event. The perf_event_attr::type is filled with pmu type. Committer testing: So, lets see what goes in perf_event_attr::config for, say, the 'instructions' PERF_TYPE_HARDWARE (0) event, first we should look at how to encode this event as a PERF_TYPE_RAW event for this specific CPU, an AMD Ryzen 5: # cat /sys/devices/cpu/events/instructions event=0xc0 # Then try with it _and_ the instruction, just to see that they are close enough: # perf stat -e rc0,instructions sleep 1 Performance counter stats for 'sleep 1': 919,794 rc0 919,898 instructions 1.000754579 seconds time elapsed 0.000715000 seconds user 0.000000000 seconds sys # Now we should try, before this patch, the PMU event encoding: # perf stat -e cpu/rc0/ sleep 1 event syntax error: 'cpu/rc0/' \___ unknown term valid terms: event,edge,inv,umask,cmask,config,config1,config2,name,period,percore # Now with this patch, the three ways of specifying the 'instructions' CPU counter are accepted: # perf stat -e cpu/rc0/,rc0,instructions sleep 1 Performance counter stats for 'sleep 1': 892,948 cpu/rc0/ 893,052 rc0 893,156 instructions 1.000931819 seconds time elapsed 0.000916000 seconds user 0.000000000 seconds sys # Requested-by: Thomas Richter Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: Thomas Richter Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sumanth Korikkar Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20200416221405.437788-1-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 5 +++++ tools/perf/tests/parse-events.c | 17 ++++++++++++++++- tools/perf/util/parse-events.l | 1 + tools/perf/util/parse-events.y | 9 +++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index cb23667531ab..376a50b3452d 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -115,6 +115,11 @@ raw encoding of 0x1A8 can be used: perf stat -e r1a8 -a sleep 1 perf record -e r1a8 ... +It's also possible to use pmu syntax: + + perf record -e r1a8 -a sleep 1 + perf record -e cpu/r1a8/ ... + You should refer to the processor specific documentation for getting these details. Some of them are referenced in the SEE ALSO section below. 
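To make the new spelling concrete, here is a minimal sketch of what an event written as cpu/r1a8/ ultimately requests from the kernel: perf_event_attr::type carries the PMU's dynamic type (readable from the PMU's sysfs 'type' file) instead of PERF_TYPE_RAW, while the rXXXX number goes straight into perf_event_attr::config. The helper name, the sysfs parsing and the error handling below are illustrative assumptions, not code from this patch:

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* e.g. open_raw_pmu_event("cpu", 0x1a8) for the cpu/r1a8/ spelling */
static int open_raw_pmu_event(const char *pmu, unsigned long long config)
{
	struct perf_event_attr attr;
	unsigned int type;
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/%s/type", pmu);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%u", &type) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;		/* the PMU's type, not PERF_TYPE_RAW */
	attr.config = config;		/* the raw rXXXX value */

	/* measure the calling thread on any CPU */
	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
}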
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 091c3aeccc27..902bd9d591a0 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1356,6 +1356,16 @@ static int test__checkevent_complex_name(struct evlist *evlist) return 0; } +static int test__checkevent_raw_pmu(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + return 0; +} + static int test__sym_event_slash(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); @@ -1750,7 +1760,12 @@ static struct evlist_test test__events_pmu[] = { .name = "cpu/name='COMPLEX_CYCLES_NAME:orig=cycles,desc=chip-clock-ticks',period=0x1,event=0x2/ukp", .check = test__checkevent_complex_name, .id = 3, - } + }, + { + .name = "software/r1a/", + .check = test__checkevent_raw_pmu, + .id = 4, + }, }; struct terms_test { diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index baa48f28d57d..c589fc42f058 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -286,6 +286,7 @@ no-overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOOVERWRITE); } percore { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_PERCORE); } aux-output { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT); } aux-sample-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); } +r{num_raw_hex} { return raw(yyscanner); } , { return ','; } "/" { BEGIN(INITIAL); return '/'; } {name_minus} { return str(yyscanner, PE_NAME); } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 94f8bcd83582..e879eb257874 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -706,6 +706,15 @@ event_term } event_term: +PE_RAW +{ + struct parse_events_term *term; + + ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_CONFIG, + NULL, $1, false, &@1, NULL)); + $$ = term; +} +| PE_NAME '=' PE_NAME { struct parse_events_term *term; From 6f91ea283a1ed23e4a548ddd62db6deb2c707f82 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:02 -0700 Subject: [PATCH 47/60] perf header: Support CPU PMU capabilities To stitch LBR call stack, the max LBR information is required. So the CPU PMU capabilities information has to be stored in perf header. Add a new feature HEADER_CPU_PMU_CAPS for CPU PMU capabilities. Retrieve all CPU PMU capabilities, not just max LBR information. Add variable max_branches to facilitate future usage. Committer testing: # ls -la /sys/devices/cpu/caps/ total 0 drwxr-xr-x. 2 root root 0 Apr 17 10:53 . drwxr-xr-x. 6 root root 0 Apr 17 07:02 .. -r--r--r--. 1 root root 4096 Apr 17 10:53 max_precise # # cat /sys/devices/cpu/caps/max_precise 0 # perf record sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.033 MB perf.data (7 samples) ] # # perf report --header-only | egrep 'cpu(desc|.*capabilities)' # cpudesc : AMD Ryzen 5 3600X 6-Core Processor # cpu pmu capabilities: max_precise=0 # And then on an Intel machine: $ ls -la /sys/devices/cpu/caps/ total 0 drwxr-xr-x. 2 root root 0 Apr 17 10:51 . drwxr-xr-x. 6 root root 0 Apr 17 10:04 .. -r--r--r--. 1 root root 4096 Apr 17 11:37 branches -r--r--r--. 1 root root 4096 Apr 17 10:51 max_precise -r--r--r--. 
1 root root 4096 Apr 17 11:37 pmu_name $ cat /sys/devices/cpu/caps/max_precise 3 $ cat /sys/devices/cpu/caps/branches 32 $ cat /sys/devices/cpu/caps/pmu_name skylake $ perf record sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.001 MB perf.data (8 samples) ] $ perf report --header-only | egrep 'cpu(desc|.*capabilities)' # cpudesc : Intel(R) Core(TM) i5-7500 CPU @ 3.40GHz # cpu pmu capabilities: branches=32, max_precise=3, pmu_name=skylake $ Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../Documentation/perf.data-file-format.txt | 16 +++ tools/perf/util/env.h | 3 + tools/perf/util/header.c | 108 ++++++++++++++++++ tools/perf/util/header.h | 1 + 4 files changed, 128 insertions(+) diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index b0152e1095c5..b6472e463284 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -373,6 +373,22 @@ struct { Indicates that trace contains records of PERF_RECORD_COMPRESSED type that have perf_events records in compressed form. + HEADER_CPU_PMU_CAPS = 28, + + A list of cpu PMU capabilities. The format of data is as below. + +struct { + u32 nr_cpu_pmu_caps; + { + char name[]; + char value[]; + } [nr_cpu_pmu_caps] +}; + + +Example: + cpu pmu capabilities: branches=32, max_precise=3, pmu_name=icelake + other bits are reserved and should ignored for now HEADER_FEAT_BITS = 256, diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 7632075a8792..1ab2682d5d2b 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -48,6 +48,7 @@ struct perf_env { char *cpuid; unsigned long long total_mem; unsigned int msr_pmu_type; + unsigned int max_branches; int nr_cmdline; int nr_sibling_cores; @@ -57,12 +58,14 @@ struct perf_env { int nr_memory_nodes; int nr_pmu_mappings; int nr_groups; + int nr_cpu_pmu_caps; char *cmdline; const char **cmdline_argv; char *sibling_cores; char *sibling_dies; char *sibling_threads; char *pmu_mappings; + char *cpu_pmu_caps; struct cpu_topology_map *cpu; struct cpu_cache_level *caches; int caches_cnt; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index acbd046bf95c..28e82da04b7a 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1395,6 +1395,38 @@ static int write_compressed(struct feat_fd *ff __maybe_unused, return do_write(ff, &(ff->ph->env.comp_mmap_len), sizeof(ff->ph->env.comp_mmap_len)); } +static int write_cpu_pmu_caps(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) +{ + struct perf_pmu *cpu_pmu = perf_pmu__find("cpu"); + struct perf_pmu_caps *caps = NULL; + int nr_caps; + int ret; + + if (!cpu_pmu) + return -ENOENT; + + nr_caps = perf_pmu__caps_parse(cpu_pmu); + if (nr_caps < 0) + return nr_caps; + + ret = do_write(ff, &nr_caps, sizeof(nr_caps)); + if (ret < 0) + return ret; + + list_for_each_entry(caps, &cpu_pmu->caps, list) { + ret = do_write_string(ff, caps->name); + if (ret < 0) + return ret; + + ret = do_write_string(ff, caps->value); + if (ret < 0) + return ret; + } + + return ret; +} + static void 
print_hostname(struct feat_fd *ff, FILE *fp) { fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname); @@ -1809,6 +1841,27 @@ static void print_compressed(struct feat_fd *ff, FILE *fp) ff->ph->env.comp_level, ff->ph->env.comp_ratio); } +static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) +{ + const char *delimiter = "# cpu pmu capabilities: "; + u32 nr_caps = ff->ph->env.nr_cpu_pmu_caps; + char *str; + + if (!nr_caps) { + fprintf(fp, "# cpu pmu capabilities: not available\n"); + return; + } + + str = ff->ph->env.cpu_pmu_caps; + while (nr_caps--) { + fprintf(fp, "%s%s", delimiter, str); + delimiter = ", "; + str += strlen(str) + 1; + } + + fprintf(fp, "\n"); +} + static void print_pmu_mappings(struct feat_fd *ff, FILE *fp) { const char *delimiter = "# pmu mappings: "; @@ -2846,6 +2899,60 @@ static int process_compressed(struct feat_fd *ff, return 0; } +static int process_cpu_pmu_caps(struct feat_fd *ff, + void *data __maybe_unused) +{ + char *name, *value; + struct strbuf sb; + u32 nr_caps; + + if (do_read_u32(ff, &nr_caps)) + return -1; + + if (!nr_caps) { + pr_debug("cpu pmu capabilities not available\n"); + return 0; + } + + ff->ph->env.nr_cpu_pmu_caps = nr_caps; + + if (strbuf_init(&sb, 128) < 0) + return -1; + + while (nr_caps--) { + name = do_read_string(ff); + if (!name) + goto error; + + value = do_read_string(ff); + if (!value) + goto free_name; + + if (strbuf_addf(&sb, "%s=%s", name, value) < 0) + goto free_value; + + /* include a NULL character at the end */ + if (strbuf_add(&sb, "", 1) < 0) + goto free_value; + + if (!strcmp(name, "branches")) + ff->ph->env.max_branches = atoi(value); + + free(value); + free(name); + } + ff->ph->env.cpu_pmu_caps = strbuf_detach(&sb, NULL); + return 0; + +free_value: + free(value); +free_name: + free(name); +error: + strbuf_release(&sb); + return -1; +} + #define FEAT_OPR(n, func, __full_only) \ [HEADER_##n] = { \ .name = __stringify(n), \ @@ -2903,6 +3010,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(BPF_PROG_INFO, bpf_prog_info, false), FEAT_OPR(BPF_BTF, bpf_btf, false), FEAT_OPR(COMPRESSED, compressed, false), + FEAT_OPR(CPU_PMU_CAPS, cpu_pmu_caps, false), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 840f95cee349..650bd1c7a99b 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -43,6 +43,7 @@ enum { HEADER_BPF_PROG_INFO, HEADER_BPF_BTF, HEADER_COMPRESSED, + HEADER_CPU_PMU_CAPS, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; From f8603267bf8589f2a6a3e0a7de0a8dc6b6bd3c7d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:05 -0700 Subject: [PATCH 48/60] perf machine: Remove the indent in resolve_lbr_callchain_sample The indent is unnecessary in resolve_lbr_callchain_sample. Removing it will make the following patch simpler. Current code path for resolve_lbr_callchain_sample() /* LBR only affects the user callchain */ if (i != chain_nr) { body of the function .... return 1; } return 0; With the patch, /* LBR only affects the user callchain */ if (i == chain_nr) return 0; body of the function ... return 1; No functional changes. 
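In miniature this is the classic guard-clause rewrite: i == chain_nr means the scan fell off the end of the callchain without finding a PERF_CONTEXT_USER marker, i.e. there is no user portion for the LBRs to resolve, so the function can bail out immediately. The fragments below are a toy restatement of the two shapes, not lines from the patch:

	/* Before: the real work nests inside the condition. */
	if (i != chain_nr) {
		/* ... resolve kernel plus LBR user call chain ... */
		return 1;
	}
	return 0;

	/* After: reject the LBR-less case up front. */
	if (i == chain_nr)
		return 0;
	/* ... the same resolution code, now at the top level ... */
	return 1;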
Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-6-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 121 +++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 59 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 09845eae9c03..be1bd9277471 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2208,6 +2208,12 @@ static int resolve_lbr_callchain_sample(struct thread *thread, int chain_nr = min(max_stack, (int)chain->nr), i; u8 cpumode = PERF_RECORD_MISC_USER; u64 ip, branch_from = 0; + struct branch_stack *lbr_stack; + struct branch_entry *entries; + int lbr_nr, j, k; + bool branch; + struct branch_flags *flags; + int mix_chain_nr; for (i = 0; i < chain_nr; i++) { if (chain->ips[i] == PERF_CONTEXT_USER) @@ -2215,71 +2221,68 @@ static int resolve_lbr_callchain_sample(struct thread *thread, } /* LBR only affects the user callchain */ - if (i != chain_nr) { - struct branch_stack *lbr_stack = sample->branch_stack; - struct branch_entry *entries = perf_sample__branch_entries(sample); - int lbr_nr = lbr_stack->nr, j, k; - bool branch; - struct branch_flags *flags; - /* - * LBR callstack can only get user call chain. - * The mix_chain_nr is kernel call chain - * number plus LBR user call chain number. - * i is kernel call chain number, - * 1 is PERF_CONTEXT_USER, - * lbr_nr + 1 is the user call chain number. - * For details, please refer to the comments - * in callchain__printf - */ - int mix_chain_nr = i + 1 + lbr_nr + 1; + if (i == chain_nr) + return 0; - for (j = 0; j < mix_chain_nr; j++) { - int err; - branch = false; - flags = NULL; + lbr_stack = sample->branch_stack; + entries = perf_sample__branch_entries(sample); + lbr_nr = lbr_stack->nr; + /* + * LBR callstack can only get user call chain. + * The mix_chain_nr is kernel call chain + * number plus LBR user call chain number. + * i is kernel call chain number, + * 1 is PERF_CONTEXT_USER, + * lbr_nr + 1 is the user call chain number. 
+ * For details, please refer to the comments + * in callchain__printf + */ + mix_chain_nr = i + 1 + lbr_nr + 1; - if (callchain_param.order == ORDER_CALLEE) { - if (j < i + 1) - ip = chain->ips[j]; - else if (j > i + 1) { - k = j - i - 2; - ip = entries[k].from; - branch = true; - flags = &entries[k].flags; - } else { - ip = entries[0].to; - branch = true; - flags = &entries[0].flags; - branch_from = entries[0].from; - } + for (j = 0; j < mix_chain_nr; j++) { + int err; + + branch = false; + flags = NULL; + + if (callchain_param.order == ORDER_CALLEE) { + if (j < i + 1) + ip = chain->ips[j]; + else if (j > i + 1) { + k = j - i - 2; + ip = entries[k].from; + branch = true; + flags = &entries[k].flags; } else { - if (j < lbr_nr) { - k = lbr_nr - j - 1; - ip = entries[k].from; - branch = true; - flags = &entries[k].flags; - } - else if (j > lbr_nr) - ip = chain->ips[i + 1 - (j - lbr_nr)]; - else { - ip = entries[0].to; - branch = true; - flags = &entries[0].flags; - branch_from = entries[0].from; - } + ip = entries[0].to; + branch = true; + flags = &entries[0].flags; + branch_from = entries[0].from; + } + } else { + if (j < lbr_nr) { + k = lbr_nr - j - 1; + ip = entries[k].from; + branch = true; + flags = &entries[k].flags; + } else if (j > lbr_nr) + ip = chain->ips[i + 1 - (j - lbr_nr)]; + else { + ip = entries[0].to; + branch = true; + flags = &entries[0].flags; + branch_from = entries[0].from; } - - err = add_callchain_ip(thread, cursor, parent, - root_al, &cpumode, ip, - branch, flags, NULL, - branch_from); - if (err) - return (err < 0) ? err : 0; } - return 1; - } - return 0; + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, ip, + branch, flags, NULL, + branch_from); + if (err) + return (err < 0) ? err : 0; + } + return 1; } static int find_prev_cpumode(struct ip_callchain *chain, struct thread *thread, From e48b8311ca4538ec716196a1625812b045999f21 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:06 -0700 Subject: [PATCH 49/60] perf machine: Refine the function for LBR call stack reconstruction LBR only collect the user call stack. To reconstruct a call stack, both kernel call stack and user call stack are required. The function resolve_lbr_callchain_sample() mix the kernel call stack and user call stack. Now, with the help of HW idx, perf tool can reconstruct a more complete call stack by adding some user call stack from previous sample. However, current implementation is hard to be extended to support it. Current code path for resolve_lbr_callchain_sample() for (j = 0; j < mix_chain_nr; j++) { if (ORDER_CALLEE) { if (kernel callchain) Fill callchain info else if (LBR callchain) Fill callchain info } else { if (LBR callchain) Fill callchain info else if (kernel callchain) Fill callchain info } add_callchain_ip(); } With the patch, if (ORDER_CALLEE) { for (j = 0; j < NUM of kernel callchain) { Fill callchain info add_callchain_ip(); } for (; j < mix_chain_nr) { Fill callchain info add_callchain_ip(); } } else { for (; j < NUM of LBR callchain) { Fill callchain info add_callchain_ip(); } for (j = 0; j < mix_chain_nr) { Fill callchain info add_callchain_ip(); } } No functional changes. 
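For readers checking the index arithmetic that the restructured loops must preserve, a worked example (the numbers are made up purely for illustration):

	/*
	 * Suppose PERF_CONTEXT_USER is found at i = 2, so chain->ips[0..1]
	 * are the kernel-side entries and chain->ips[2] is the marker, and
	 * the sample carries lbr_nr = 3 LBR entries.  Then
	 *
	 *	mix_chain_nr = i + 1 + lbr_nr + 1 = 7
	 *
	 * and, in callee order, the cursor is filled as:
	 *
	 *	chain->ips[0..2]	kernel entries plus the user marker
	 *	entries[0].to		the current user location
	 *	entries[0..2].from	the user callers, one branch at a time
	 */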
Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-7-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 113 ++++++++++++++++++++++++++------------ 1 file changed, 77 insertions(+), 36 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index be1bd9277471..0da540e6f803 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2214,6 +2214,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, bool branch; struct branch_flags *flags; int mix_chain_nr; + int err; for (i = 0; i < chain_nr; i++) { if (chain->ips[i] == PERF_CONTEXT_USER) @@ -2239,50 +2240,90 @@ static int resolve_lbr_callchain_sample(struct thread *thread, */ mix_chain_nr = i + 1 + lbr_nr + 1; - for (j = 0; j < mix_chain_nr; j++) { - int err; - - branch = false; - flags = NULL; - - if (callchain_param.order == ORDER_CALLEE) { - if (j < i + 1) - ip = chain->ips[j]; - else if (j > i + 1) { - k = j - i - 2; - ip = entries[k].from; - branch = true; - flags = &entries[k].flags; - } else { - ip = entries[0].to; - branch = true; - flags = &entries[0].flags; - branch_from = entries[0].from; - } - } else { - if (j < lbr_nr) { - k = lbr_nr - j - 1; - ip = entries[k].from; - branch = true; - flags = &entries[k].flags; - } else if (j > lbr_nr) - ip = chain->ips[i + 1 - (j - lbr_nr)]; - else { - ip = entries[0].to; - branch = true; - flags = &entries[0].flags; - branch_from = entries[0].from; - } + if (callchain_param.order == ORDER_CALLEE) { + /* Add kernel ip */ + for (j = 0; j < i + 1; j++) { + ip = chain->ips[j]; + branch = false; + flags = NULL; + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, ip, + branch, flags, NULL, + branch_from); + if (err) + goto error; } - + /* Add LBR ip from first entries.to */ + ip = entries[0].to; + branch = true; + flags = &entries[0].flags; + branch_from = entries[0].from; err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip, branch, flags, NULL, branch_from); if (err) - return (err < 0) ? err : 0; + goto error; + + /* Add LBR ip from entries.from one by one. */ + for (j = i + 2; j < mix_chain_nr; j++) { + k = j - i - 2; + ip = entries[k].from; + branch = true; + flags = &entries[k].flags; + + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, ip, + branch, flags, NULL, + branch_from); + if (err) + goto error; + } + } else { + /* Add LBR ip from entries.from one by one. 
*/ + for (j = 0; j < lbr_nr; j++) { + k = lbr_nr - j - 1; + ip = entries[k].from; + branch = true; + flags = &entries[k].flags; + + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, ip, + branch, flags, NULL, + branch_from); + if (err) + goto error; + } + + /* Add LBR ip from first entries.to */ + ip = entries[0].to; + branch = true; + flags = &entries[0].flags; + branch_from = entries[0].from; + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, ip, + branch, flags, NULL, + branch_from); + if (err) + goto error; + + /* Add kernel ip */ + for (j = lbr_nr + 1; j < mix_chain_nr; j++) { + ip = chain->ips[i + 1 - (j - lbr_nr)]; + branch = false; + flags = NULL; + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, ip, + branch, flags, NULL, + branch_from); + if (err) + goto error; + } } return 1; + +error: + return (err < 0) ? err : 0; } static int find_prev_cpumode(struct ip_callchain *chain, struct thread *thread, From dd3e249a0c0ad88098922803b149c788bb364c23 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:07 -0700 Subject: [PATCH 50/60] perf machine: Factor out lbr_callchain_add_kernel_ip() Both caller and callee needs to add kernel ip to callchain. Factor out lbr_callchain_add_kernel_ip() to improve code readability. Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-8-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 67 ++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 0da540e6f803..a7f75fd43b0f 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2190,6 +2190,40 @@ static int remove_loops(struct branch_entry *l, int nr, return nr; } +static int lbr_callchain_add_kernel_ip(struct thread *thread, + struct callchain_cursor *cursor, + struct perf_sample *sample, + struct symbol **parent, + struct addr_location *root_al, + u64 branch_from, + bool callee, int end) +{ + struct ip_callchain *chain = sample->callchain; + u8 cpumode = PERF_RECORD_MISC_USER; + int err, i; + + if (callee) { + for (i = 0; i < end + 1; i++) { + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, chain->ips[i], + false, NULL, NULL, branch_from); + if (err) + return err; + } + return 0; + } + + for (i = end; i >= 0; i--) { + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, chain->ips[i], + false, NULL, NULL, branch_from); + if (err) + return err; + } + + return 0; +} + /* * Recolve LBR callstack chain sample * Return: @@ -2242,17 +2276,12 @@ static int resolve_lbr_callchain_sample(struct thread *thread, if (callchain_param.order == ORDER_CALLEE) { /* Add kernel ip */ - for (j = 0; j < i + 1; j++) { - ip = chain->ips[j]; - branch = false; - flags = NULL; - err = add_callchain_ip(thread, cursor, parent, - root_al, &cpumode, ip, - branch, flags, NULL, - branch_from); - if (err) - goto error; - } + err = lbr_callchain_add_kernel_ip(thread, cursor, sample, + parent, root_al, branch_from, + true, i); + if (err) + goto error; + /* Add LBR ip from first entries.to */ ip = entries[0].to; branch = true; @@ -2308,17 +2337,11 @@ static int resolve_lbr_callchain_sample(struct thread 
*thread, goto error; /* Add kernel ip */ - for (j = lbr_nr + 1; j < mix_chain_nr; j++) { - ip = chain->ips[i + 1 - (j - lbr_nr)]; - branch = false; - flags = NULL; - err = add_callchain_ip(thread, cursor, parent, - root_al, &cpumode, ip, - branch, flags, NULL, - branch_from); - if (err) - goto error; - } + err = lbr_callchain_add_kernel_ip(thread, cursor, sample, + parent, root_al, branch_from, + false, i); + if (err) + goto error; } return 1; From e2b23483eb1d851b4c48935a995f79b2de41c3ed Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:08 -0700 Subject: [PATCH 51/60] perf machine: Factor out lbr_callchain_add_lbr_ip() Both caller and callee needs to add ip from LBR to callchain. Factor out lbr_callchain_add_lbr_ip() to improve code readability. Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-9-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 143 +++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 70 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index a7f75fd43b0f..f9d69fce584a 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2224,6 +2224,74 @@ static int lbr_callchain_add_kernel_ip(struct thread *thread, return 0; } +static int lbr_callchain_add_lbr_ip(struct thread *thread, + struct callchain_cursor *cursor, + struct perf_sample *sample, + struct symbol **parent, + struct addr_location *root_al, + u64 *branch_from, + bool callee) +{ + struct branch_stack *lbr_stack = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); + u8 cpumode = PERF_RECORD_MISC_USER; + int lbr_nr = lbr_stack->nr; + struct branch_flags *flags; + int err, i; + u64 ip; + + if (callee) { + /* Add LBR ip from first entries.to */ + ip = entries[0].to; + flags = &entries[0].flags; + *branch_from = entries[0].from; + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, ip, + true, flags, NULL, + *branch_from); + if (err) + return err; + + /* Add LBR ip from entries.from one by one. */ + for (i = 0; i < lbr_nr; i++) { + ip = entries[i].from; + flags = &entries[i].flags; + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, ip, + true, flags, NULL, + *branch_from); + if (err) + return err; + } + return 0; + } + + /* Add LBR ip from entries.from one by one. 
*/ + for (i = lbr_nr - 1; i >= 0; i--) { + ip = entries[i].from; + flags = &entries[i].flags; + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, ip, + true, flags, NULL, + *branch_from); + if (err) + return err; + } + + /* Add LBR ip from first entries.to */ + ip = entries[0].to; + flags = &entries[0].flags; + *branch_from = entries[0].from; + err = add_callchain_ip(thread, cursor, parent, + root_al, &cpumode, ip, + true, flags, NULL, + *branch_from); + if (err) + return err; + + return 0; +} + /* * Recolve LBR callstack chain sample * Return: @@ -2240,14 +2308,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, { struct ip_callchain *chain = sample->callchain; int chain_nr = min(max_stack, (int)chain->nr), i; - u8 cpumode = PERF_RECORD_MISC_USER; - u64 ip, branch_from = 0; - struct branch_stack *lbr_stack; - struct branch_entry *entries; - int lbr_nr, j, k; - bool branch; - struct branch_flags *flags; - int mix_chain_nr; + u64 branch_from = 0; int err; for (i = 0; i < chain_nr; i++) { @@ -2259,21 +2320,6 @@ static int resolve_lbr_callchain_sample(struct thread *thread, if (i == chain_nr) return 0; - lbr_stack = sample->branch_stack; - entries = perf_sample__branch_entries(sample); - lbr_nr = lbr_stack->nr; - /* - * LBR callstack can only get user call chain. - * The mix_chain_nr is kernel call chain - * number plus LBR user call chain number. - * i is kernel call chain number, - * 1 is PERF_CONTEXT_USER, - * lbr_nr + 1 is the user call chain number. - * For details, please refer to the comments - * in callchain__printf - */ - mix_chain_nr = i + 1 + lbr_nr + 1; - if (callchain_param.order == ORDER_CALLEE) { /* Add kernel ip */ err = lbr_callchain_add_kernel_ip(thread, cursor, sample, @@ -2282,57 +2328,14 @@ static int resolve_lbr_callchain_sample(struct thread *thread, if (err) goto error; - /* Add LBR ip from first entries.to */ - ip = entries[0].to; - branch = true; - flags = &entries[0].flags; - branch_from = entries[0].from; - err = add_callchain_ip(thread, cursor, parent, - root_al, &cpumode, ip, - branch, flags, NULL, - branch_from); + err = lbr_callchain_add_lbr_ip(thread, cursor, sample, parent, + root_al, &branch_from, true); if (err) goto error; - /* Add LBR ip from entries.from one by one. */ - for (j = i + 2; j < mix_chain_nr; j++) { - k = j - i - 2; - ip = entries[k].from; - branch = true; - flags = &entries[k].flags; - - err = add_callchain_ip(thread, cursor, parent, - root_al, &cpumode, ip, - branch, flags, NULL, - branch_from); - if (err) - goto error; - } } else { - /* Add LBR ip from entries.from one by one. */ - for (j = 0; j < lbr_nr; j++) { - k = lbr_nr - j - 1; - ip = entries[k].from; - branch = true; - flags = &entries[k].flags; - - err = add_callchain_ip(thread, cursor, parent, - root_al, &cpumode, ip, - branch, flags, NULL, - branch_from); - if (err) - goto error; - } - - /* Add LBR ip from first entries.to */ - ip = entries[0].to; - branch = true; - flags = &entries[0].flags; - branch_from = entries[0].from; - err = add_callchain_ip(thread, cursor, parent, - root_al, &cpumode, ip, - branch, flags, NULL, - branch_from); + err = lbr_callchain_add_lbr_ip(thread, cursor, sample, parent, + root_al, &branch_from, false); if (err) goto error; From 771fd155dfaa5332da69d606db16fe27bd9d388d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:09 -0700 Subject: [PATCH 52/60] perf thread: Add a knob for LBR stitch approach The LBR stitch approach should be disabled by default. 
Because - The stitching approach base on LBR call stack technology. The known limitations of LBR call stack technology still apply to the approach, e.g. Exception handing such as setjmp/longjmp will have calls/returns not match. - This approach is not foolproof. There can be cases where it creates incorrect call stacks from incorrect matches. There is no attempt to validate any matches in another way. The 'lbr_stitch_enable' is used to indicate whether enable LBR stitch approach, which is disabled by default. The following patch will introduce a new option for each tools to enable the LBR stitch approach. Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-10-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread.c | 1 + tools/perf/util/thread.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 28b719388028..1f080db23615 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -47,6 +47,7 @@ struct thread *thread__new(pid_t pid, pid_t tid) thread->tid = tid; thread->ppid = -1; thread->cpu = -1; + thread->lbr_stitch_enable = false; INIT_LIST_HEAD(&thread->namespaces_list); INIT_LIST_HEAD(&thread->comm_list); init_rwsem(&thread->namespaces_lock); diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 20b96b5d1f15..95294050cff2 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -46,6 +46,9 @@ struct thread { struct srccode_state srccode_state; bool filter; int filter_entry_depth; + + /* LBR call stack stitch */ + bool lbr_stitch_enable; }; struct machine; From 9c6c3f471d85a9b0bcda3ce6fc1e2646685e3f60 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:10 -0700 Subject: [PATCH 53/60] perf thread: Save previous sample for LBR stitching approach To retrieve the overwritten LBRs from previous sample for LBR stitching approach, perf has to save the previous sample. Only allocate the struct lbr_stitch once, when LBR stitching approach is enabled and kernel supports hw_idx. Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-11-kan.liang@linux.intel.com [ Use zalloc()/zfree() for thread->lbr_stitch ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 23 +++++++++++++++++++++++ tools/perf/util/thread.c | 1 + tools/perf/util/thread.h | 12 ++++++++++++ 3 files changed, 36 insertions(+) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index f9d69fce584a..a54ca09a1d00 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2292,6 +2292,21 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread, return 0; } +static bool alloc_lbr_stitch(struct thread *thread) +{ + if (thread->lbr_stitch) + return true; + + thread->lbr_stitch = zalloc(sizeof(*thread->lbr_stitch)); + if (!thread->lbr_stitch) + goto err; + +err: + pr_warning("Failed to allocate space for stitched LBRs. 
Disable LBR stitch\n"); + thread->lbr_stitch_enable = false; + return false; +} + /* * Recolve LBR callstack chain sample * Return: @@ -2308,6 +2323,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, { struct ip_callchain *chain = sample->callchain; int chain_nr = min(max_stack, (int)chain->nr), i; + struct lbr_stitch *lbr_stitch; u64 branch_from = 0; int err; @@ -2320,6 +2336,13 @@ static int resolve_lbr_callchain_sample(struct thread *thread, if (i == chain_nr) return 0; + if (thread->lbr_stitch_enable && !sample->no_hw_idx && + alloc_lbr_stitch(thread)) { + lbr_stitch = thread->lbr_stitch; + + memcpy(&lbr_stitch->prev_sample, sample, sizeof(*sample)); + } + if (callchain_param.order == ORDER_CALLEE) { /* Add kernel ip */ err = lbr_callchain_add_kernel_ip(thread, cursor, sample, diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 1f080db23615..8d0da260c84c 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -111,6 +111,7 @@ void thread__delete(struct thread *thread) exit_rwsem(&thread->namespaces_lock); exit_rwsem(&thread->comm_lock); + thread__free_stitch_list(thread); free(thread); } diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 95294050cff2..34eb61cee6a4 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include "rwsem.h" +#include "event.h" struct addr_location; struct map; @@ -20,6 +22,10 @@ struct perf_record_namespaces; struct thread_stack; struct unwind_libunwind_ops; +struct lbr_stitch { + struct perf_sample prev_sample; +}; + struct thread { union { struct rb_node rb_node; @@ -49,6 +55,7 @@ struct thread { /* LBR call stack stitch */ bool lbr_stitch_enable; + struct lbr_stitch *lbr_stitch; }; struct machine; @@ -145,4 +152,9 @@ static inline bool thread__is_filtered(struct thread *thread) return false; } +static inline void thread__free_stitch_list(struct thread *thread) +{ + zfree(&thread->lbr_stitch); +} + #endif /* __PERF_THREAD_H */ From 7f1d39317c071268b4204175df7cfbb2187acb72 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:11 -0700 Subject: [PATCH 54/60] perf callchain: Save previous cursor nodes for LBR stitching approach The cursor nodes which generates from sample are eventually added into callchain. To avoid generating cursor nodes from previous samples again, the previous cursor nodes are also saved for LBR stitching approach. Some option, e.g. hide-unresolved, may hide some LBRs. Add a variable 'valid' in struct callchain_cursor_node to indicate this case. The LBR stitching approach will only append the valid cursor nodes from previous samples later. 
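In sketch form, the consumer that this 'valid' flag enables is the walk added by the stitching patch later in this series; it is simplified here, with the list-append body elided:

	/*
	 * Walk the cursor nodes saved from the previous sample and skip
	 * any entry that was hidden, e.g. by --hide-unresolved.
	 */
	for (i = prev_stack->nr - 1; i > (int)distance; i--) {
		if (!lbr_stitch->prev_lbr_cursor[i].valid)
			continue;
		/* copy prev_lbr_cursor[i] onto the stitch list */
	}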
Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-12-kan.liang@linux.intel.com [ Use zfree() instead of open coded equivalent, and use it when freeing members of structs ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.h | 3 ++ tools/perf/util/machine.c | 76 +++++++++++++++++++++++++++++++++++-- tools/perf/util/thread.h | 8 ++++ 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 706bb7bbe1e1..cb33cd42ff43 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -143,6 +143,9 @@ struct callchain_cursor_node { u64 ip; struct map_symbol ms; const char *srcline; + /* Indicate valid cursor node for LBR stitch */ + bool valid; + bool branch; struct branch_flags branch_flags; u64 branch_from; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index a54ca09a1d00..737dee723a57 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2224,6 +2224,31 @@ static int lbr_callchain_add_kernel_ip(struct thread *thread, return 0; } +static void save_lbr_cursor_node(struct thread *thread, + struct callchain_cursor *cursor, + int idx) +{ + struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + + if (!lbr_stitch) + return; + + if (cursor->pos == cursor->nr) { + lbr_stitch->prev_lbr_cursor[idx].valid = false; + return; + } + + if (!cursor->curr) + cursor->curr = cursor->first; + else + cursor->curr = cursor->curr->next; + memcpy(&lbr_stitch->prev_lbr_cursor[idx], cursor->curr, + sizeof(struct callchain_cursor_node)); + + lbr_stitch->prev_lbr_cursor[idx].valid = true; + cursor->pos++; +} + static int lbr_callchain_add_lbr_ip(struct thread *thread, struct callchain_cursor *cursor, struct perf_sample *sample, @@ -2240,6 +2265,21 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread, int err, i; u64 ip; + /* + * The curr and pos are not used in writing session. They are cleared + * in callchain_cursor_commit() when the writing session is closed. + * Using curr and pos to track the current cursor node. + */ + if (thread->lbr_stitch) { + cursor->curr = NULL; + cursor->pos = cursor->nr; + if (cursor->nr) { + cursor->curr = cursor->first; + for (i = 0; i < (int)(cursor->nr - 1); i++) + cursor->curr = cursor->curr->next; + } + } + if (callee) { /* Add LBR ip from first entries.to */ ip = entries[0].to; @@ -2252,6 +2292,20 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread, if (err) return err; + /* + * The number of cursor node increases. + * Move the current cursor node. + * But does not need to save current cursor node for entry 0. + * It's impossible to stitch the whole LBRs of previous sample. + */ + if (thread->lbr_stitch && (cursor->pos != cursor->nr)) { + if (!cursor->curr) + cursor->curr = cursor->first; + else + cursor->curr = cursor->curr->next; + cursor->pos++; + } + /* Add LBR ip from entries.from one by one. 
*/ for (i = 0; i < lbr_nr; i++) { ip = entries[i].from; @@ -2262,6 +2316,7 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread, *branch_from); if (err) return err; + save_lbr_cursor_node(thread, cursor, i); } return 0; } @@ -2276,6 +2331,7 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread, *branch_from); if (err) return err; + save_lbr_cursor_node(thread, cursor, i); } /* Add LBR ip from first entries.to */ @@ -2292,7 +2348,7 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread, return 0; } -static bool alloc_lbr_stitch(struct thread *thread) +static bool alloc_lbr_stitch(struct thread *thread, unsigned int max_lbr) { if (thread->lbr_stitch) return true; @@ -2301,6 +2357,14 @@ static bool alloc_lbr_stitch(struct thread *thread) if (!thread->lbr_stitch) goto err; + thread->lbr_stitch->prev_lbr_cursor = calloc(max_lbr + 1, sizeof(struct callchain_cursor_node)); + if (!thread->lbr_stitch->prev_lbr_cursor) + goto free_lbr_stitch; + + return true; + +free_lbr_stitch: + zfree(&thread->lbr_stitch); err: pr_warning("Failed to allocate space for stitched LBRs. Disable LBR stitch\n"); thread->lbr_stitch_enable = false; @@ -2319,7 +2383,8 @@ static int resolve_lbr_callchain_sample(struct thread *thread, struct perf_sample *sample, struct symbol **parent, struct addr_location *root_al, - int max_stack) + int max_stack, + unsigned int max_lbr) { struct ip_callchain *chain = sample->callchain; int chain_nr = min(max_stack, (int)chain->nr), i; @@ -2337,7 +2402,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, return 0; if (thread->lbr_stitch_enable && !sample->no_hw_idx && - alloc_lbr_stitch(thread)) { + (max_lbr > 0) && alloc_lbr_stitch(thread, max_lbr)) { lbr_stitch = thread->lbr_stitch; memcpy(&lbr_stitch->prev_sample, sample, sizeof(*sample)); @@ -2417,8 +2482,11 @@ static int thread__resolve_callchain_sample(struct thread *thread, chain_nr = chain->nr; if (perf_evsel__has_branch_callstack(evsel)) { + struct perf_env *env = perf_evsel__env(evsel); + err = resolve_lbr_callchain_sample(thread, cursor, sample, parent, - root_al, max_stack); + root_al, max_stack, + !env ? 0 : env->max_branches); if (err) return (err < 0) ? err : 0; } diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 34eb61cee6a4..8456174a52c5 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -15,6 +15,7 @@ #include #include "rwsem.h" #include "event.h" +#include "callchain.h" struct addr_location; struct map; @@ -24,6 +25,7 @@ struct unwind_libunwind_ops; struct lbr_stitch { struct perf_sample prev_sample; + struct callchain_cursor_node *prev_lbr_cursor; }; struct thread { @@ -154,6 +156,12 @@ static inline bool thread__is_filtered(struct thread *thread) static inline void thread__free_stitch_list(struct thread *thread) { + struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + + if (!lbr_stitch) + return; + + zfree(&lbr_stitch->prev_lbr_cursor); zfree(&thread->lbr_stitch); } From ff165628d72644e37674c5485658e8bd9f4a348b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:12 -0700 Subject: [PATCH 55/60] perf callchain: Stitch LBR call stack In LBR call stack mode, the depth of reconstructed LBR call stack limits to the number of LBR registers. For example, on skylake, the depth of reconstructed LBR call stack is always <= 32. # To display the perf.data header info, please use # --header/--header-only options. 
# # # Total Lost Samples: 0 # # Samples: 6K of event 'cycles' # Event count (approx.): 6487119731 # # Children Self Command Shared Object Symbol # ........ ........ ............... .................. # ................................ 99.97% 99.97% tchain_edit tchain_edit [.] f43 | --99.64%--f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22 f23 f24 f25 f26 f27 f28 f29 f30 f31 f32 f33 f34 f35 f36 f37 f38 f39 f40 f41 f42 f43 For a call stack which is deeper than LBR limit, HW will overwrite the LBR register with oldest branch. Only partial call stacks can be reconstructed. However, the overwritten LBRs may still be retrieved from previous sample. At that moment, HW hasn't overwritten the LBR registers yet. Perf tools can stitch those overwritten LBRs on current call stacks to get a more complete call stack. To determine if LBRs can be stitched, perf tools need to compare current sample with previous sample. - They should have identical LBR records (Same from, to and flags values, and the same physical index of LBR registers). - The searching starts from the base-of-stack of current sample. Once perf determines to stitch the previous LBRs, the corresponding LBR cursor nodes will be copied to 'lists'. The 'lists' is to track the LBR cursor nodes which are going to be stitched. When the stitching is over, the nodes will not be freed immediately. They will be moved to 'free_lists'. Next stitching may reuse the space. Both 'lists' and 'free_lists' will be freed when all samples are processed. Committer notes: Fix the intel-pt.c initialization of the union with 'struct branch_flags', that breaks the build with its unnamed union on older gcc versions. Uninline thread__free_stitch_list(), as it grew big and started dragging includes to thread.h, so move it to thread.c where what it needs in terms of headers are already there. This fixes the build in several systems such as debian:experimental when cross building to the MIPS32 architecture, i.e. in the other cases what was needed was being included by sheer luck. 
In file included from builtin-sched.c:11: util/thread.h: In function 'thread__free_stitch_list': util/thread.h:169:3: error: implicit declaration of function 'free' [-Werror=implicit-function-declaration] 169 | free(pos); | ^~~~ util/thread.h:169:3: error: incompatible implicit declaration of built-in function 'free' [-Werror] util/thread.h:19:1: note: include '' or provide a declaration of 'free' 18 | #include "callchain.h" +++ |+#include 19 | util/thread.h:174:3: error: incompatible implicit declaration of built-in function 'free' [-Werror] 174 | free(pos); | ^~~~ util/thread.h:174:3: note: include '' or provide a declaration of 'free' Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-13-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/branch.h | 19 +++-- tools/perf/util/callchain.h | 5 ++ tools/perf/util/intel-pt.c | 17 +++-- tools/perf/util/machine.c | 139 +++++++++++++++++++++++++++++++++++- tools/perf/util/thread.c | 22 ++++++ tools/perf/util/thread.h | 14 +--- 6 files changed, 188 insertions(+), 28 deletions(-) diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h index 154a05cd03af..4d3f02fa223d 100644 --- a/tools/perf/util/branch.h +++ b/tools/perf/util/branch.h @@ -15,13 +15,18 @@ #include "event.h" struct branch_flags { - u64 mispred:1; - u64 predicted:1; - u64 in_tx:1; - u64 abort:1; - u64 cycles:16; - u64 type:4; - u64 reserved:40; + union { + u64 value; + struct { + u64 mispred:1; + u64 predicted:1; + u64 in_tx:1; + u64 abort:1; + u64 cycles:16; + u64 type:4; + u64 reserved:40; + }; + }; }; struct branch_info { diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index cb33cd42ff43..8f668ee29f25 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -154,6 +154,11 @@ struct callchain_cursor_node { struct callchain_cursor_node *next; }; +struct stitch_list { + struct list_head node; + struct callchain_cursor_node cursor; +}; + struct callchain_cursor { u64 nr; struct callchain_cursor_node *first; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index a659b4a1b3f2..4be7634dccf5 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1717,15 +1717,14 @@ static u64 intel_pt_lbr_flags(u64 info) union { struct branch_flags flags; u64 result; - } u = { - .flags = { - .mispred = !!(info & LBR_INFO_MISPRED), - .predicted = !(info & LBR_INFO_MISPRED), - .in_tx = !!(info & LBR_INFO_IN_TX), - .abort = !!(info & LBR_INFO_ABORT), - .cycles = info & LBR_INFO_CYCLES, - } - }; + } u; + + u.result = 0; + u.flags.mispred = !!(info & LBR_INFO_MISPRED); + u.flags.predicted = !(info & LBR_INFO_MISPRED); + u.flags.in_tx = !!(info & LBR_INFO_IN_TX); + u.flags.abort = !!(info & LBR_INFO_ABORT); + u.flags.cycles = info & LBR_INFO_CYCLES; return u.result; } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 737dee723a57..5ac32cabe4e6 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2348,6 +2348,119 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread, return 0; } +static int lbr_callchain_add_stitched_lbr_ip(struct thread *thread, + struct callchain_cursor *cursor) +{ + struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + struct callchain_cursor_node 
*cnode; + struct stitch_list *stitch_node; + int err; + + list_for_each_entry(stitch_node, &lbr_stitch->lists, node) { + cnode = &stitch_node->cursor; + + err = callchain_cursor_append(cursor, cnode->ip, + &cnode->ms, + cnode->branch, + &cnode->branch_flags, + cnode->nr_loop_iter, + cnode->iter_cycles, + cnode->branch_from, + cnode->srcline); + if (err) + return err; + } + return 0; +} + +static struct stitch_list *get_stitch_node(struct thread *thread) +{ + struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + struct stitch_list *stitch_node; + + if (!list_empty(&lbr_stitch->free_lists)) { + stitch_node = list_first_entry(&lbr_stitch->free_lists, + struct stitch_list, node); + list_del(&stitch_node->node); + + return stitch_node; + } + + return malloc(sizeof(struct stitch_list)); +} + +static bool has_stitched_lbr(struct thread *thread, + struct perf_sample *cur, + struct perf_sample *prev, + unsigned int max_lbr, + bool callee) +{ + struct branch_stack *cur_stack = cur->branch_stack; + struct branch_entry *cur_entries = perf_sample__branch_entries(cur); + struct branch_stack *prev_stack = prev->branch_stack; + struct branch_entry *prev_entries = perf_sample__branch_entries(prev); + struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + int i, j, nr_identical_branches = 0; + struct stitch_list *stitch_node; + u64 cur_base, distance; + + if (!cur_stack || !prev_stack) + return false; + + /* Find the physical index of the base-of-stack for current sample. */ + cur_base = max_lbr - cur_stack->nr + cur_stack->hw_idx + 1; + + distance = (prev_stack->hw_idx > cur_base) ? (prev_stack->hw_idx - cur_base) : + (max_lbr + prev_stack->hw_idx - cur_base); + /* Previous sample has shorter stack. Nothing can be stitched. */ + if (distance + 1 > prev_stack->nr) + return false; + + /* + * Check if there are identical LBRs between two samples. + * Identicall LBRs must have same from, to and flags values. Also, + * they have to be saved in the same LBR registers (same physical + * index). + * + * Starts from the base-of-stack of current sample. + */ + for (i = distance, j = cur_stack->nr - 1; (i >= 0) && (j >= 0); i--, j--) { + if ((prev_entries[i].from != cur_entries[j].from) || + (prev_entries[i].to != cur_entries[j].to) || + (prev_entries[i].flags.value != cur_entries[j].flags.value)) + break; + nr_identical_branches++; + } + + if (!nr_identical_branches) + return false; + + /* + * Save the LBRs between the base-of-stack of previous sample + * and the base-of-stack of current sample into lbr_stitch->lists. + * These LBRs will be stitched later. 
+ */ + for (i = prev_stack->nr - 1; i > (int)distance; i--) { + + if (!lbr_stitch->prev_lbr_cursor[i].valid) + continue; + + stitch_node = get_stitch_node(thread); + if (!stitch_node) + return false; + + memcpy(&stitch_node->cursor, &lbr_stitch->prev_lbr_cursor[i], + sizeof(struct callchain_cursor_node)); + + if (callee) + list_add(&stitch_node->node, &lbr_stitch->lists); + else + list_add_tail(&stitch_node->node, &lbr_stitch->lists); + } + + return true; +} + static bool alloc_lbr_stitch(struct thread *thread, unsigned int max_lbr) { if (thread->lbr_stitch) @@ -2361,6 +2474,9 @@ static bool alloc_lbr_stitch(struct thread *thread, unsigned int max_lbr) if (!thread->lbr_stitch->prev_lbr_cursor) goto free_lbr_stitch; + INIT_LIST_HEAD(&thread->lbr_stitch->lists); + INIT_LIST_HEAD(&thread->lbr_stitch->free_lists); + return true; free_lbr_stitch: @@ -2386,9 +2502,11 @@ static int resolve_lbr_callchain_sample(struct thread *thread, int max_stack, unsigned int max_lbr) { + bool callee = (callchain_param.order == ORDER_CALLEE); struct ip_callchain *chain = sample->callchain; int chain_nr = min(max_stack, (int)chain->nr), i; struct lbr_stitch *lbr_stitch; + bool stitched_lbr = false; u64 branch_from = 0; int err; @@ -2405,10 +2523,18 @@ static int resolve_lbr_callchain_sample(struct thread *thread, (max_lbr > 0) && alloc_lbr_stitch(thread, max_lbr)) { lbr_stitch = thread->lbr_stitch; + stitched_lbr = has_stitched_lbr(thread, sample, + &lbr_stitch->prev_sample, + max_lbr, callee); + + if (!stitched_lbr && !list_empty(&lbr_stitch->lists)) { + list_replace_init(&lbr_stitch->lists, + &lbr_stitch->free_lists); + } memcpy(&lbr_stitch->prev_sample, sample, sizeof(*sample)); } - if (callchain_param.order == ORDER_CALLEE) { + if (callee) { /* Add kernel ip */ err = lbr_callchain_add_kernel_ip(thread, cursor, sample, parent, root_al, branch_from, @@ -2421,7 +2547,18 @@ static int resolve_lbr_callchain_sample(struct thread *thread, if (err) goto error; + if (stitched_lbr) { + err = lbr_callchain_add_stitched_lbr_ip(thread, cursor); + if (err) + goto error; + } + } else { + if (stitched_lbr) { + err = lbr_callchain_add_stitched_lbr_ip(thread, cursor); + if (err) + goto error; + } err = lbr_callchain_add_lbr_ip(thread, cursor, sample, parent, root_al, &branch_from, false); if (err) diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 8d0da260c84c..665e5c0618ed 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -454,3 +454,25 @@ int thread__memcpy(struct thread *thread, struct machine *machine, return dso__data_read_offset(al.map->dso, machine, offset, buf, len); } + +void thread__free_stitch_list(struct thread *thread) +{ + struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + struct stitch_list *pos, *tmp; + + if (!lbr_stitch) + return; + + list_for_each_entry_safe(pos, tmp, &lbr_stitch->lists, node) { + list_del_init(&pos->node); + free(pos); + } + + list_for_each_entry_safe(pos, tmp, &lbr_stitch->free_lists, node) { + list_del_init(&pos->node); + free(pos); + } + + zfree(&lbr_stitch->prev_lbr_cursor); + zfree(&thread->lbr_stitch); +} diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 8456174a52c5..b066fb30d203 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -24,6 +23,8 @@ struct thread_stack; struct unwind_libunwind_ops; struct lbr_stitch { + struct list_head lists; + struct list_head free_lists; struct perf_sample prev_sample; struct 
callchain_cursor_node *prev_lbr_cursor; }; @@ -154,15 +155,6 @@ static inline bool thread__is_filtered(struct thread *thread) return false; } -static inline void thread__free_stitch_list(struct thread *thread) -{ - struct lbr_stitch *lbr_stitch = thread->lbr_stitch; - - if (!lbr_stitch) - return; - - zfree(&lbr_stitch->prev_lbr_cursor); - zfree(&thread->lbr_stitch); -} +void thread__free_stitch_list(struct thread *thread); #endif /* __PERF_THREAD_H */ From b1d1429b1820e1587d8588fc05b28ef9af42cfc6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:13 -0700 Subject: [PATCH 56/60] perf report: Add option to enable the LBR stitching approach With the LBR stitching approach, the reconstructed LBR call stack can break the HW limitation. However, it may reconstruct invalid call stacks in some cases, e.g. exception handing such as setjmp/longjmp. Also, it may impact the processing time especially when the number of samples with stitched LBRs are huge. Add an option to enable the approach. # To display the perf.data header info, please use # --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 6K of event 'cycles' # Event count (approx.): 6492797701 # # Children Self Command Shared Object Symbol # ........ ........ ............... .................. # ................................. # 99.99% 99.99% tchain_edit tchain_edit [.] f43 | ---main f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22 f23 f24 f25 f26 f27 f28 f29 f30 f31 | --99.65%--f32 f33 f34 f35 f36 f37 f38 f39 f40 f41 f42 f43 Committer testing: $ perf record --call-graph lbr /wb/tchain_edit [ perf record: Woken up 23 times to write data ] [ perf record: Captured and wrote 5.578 MB perf.data (6839 samples) ] $ perf report --header-only | egrep 'cpu(desc|.*capabilities)' # cpudesc : Intel(R) Core(TM) i5-7500 CPU @ 3.40GHz # cpu pmu capabilities: branches=32, max_precise=3, pmu_name=skylake $ Before: $ perf report --no-children --stdio # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 6K of event 'cycles:u' # Event count (approx.): 6459523879 # # Overhead Command Shared Object Symbol # ........ ........... ................ ....................... # 99.95% tchain_edit tchain_edit [.] f43 | --99.92%--f43 f42 f41 f40 f39 f38 f37 f36 f35 f34 f33 f32 f31 f30 f29 f28 f27 f26 f25 f24 f23 f22 f21 f20 f19 f18 f17 f16 f15 f14 f13 f12 f11 0.03% tchain_edit tchain_edit [.] f42 0.01% tchain_edit tchain_edit [.] f41 0.00% tchain_edit tchain_edit [.] f31 0.00% tchain_edit ld-2.29.so [.] _dl_relocate_object 0.00% tchain_edit ld-2.29.so [.] memmove 0.00% tchain_edit [unknown] [k] 0xffffffff93a00b17 After: $ perf report --stitch-lbr --no-children --stdio # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 6K of event 'cycles:u' # Event count (approx.): 6459496645 # # Overhead Command Shared Object Symbol # ........ ........... ................ ........................ # 99.97% tchain_edit tchain_edit [.] f43 | --99.93%--f43 f42 f41 f40 f39 f38 f37 f36 f35 f34 f33 f32 f31 f30 f29 f28 f27 f26 f25 f24 f23 f22 f21 f20 f19 f18 f17 f16 f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 main __libc_start_main 0.02% tchain_edit [unknown] [k] 0xffffffff93a00b17 0.01% tchain_edit tchain_edit [.] f31 0.00% tchain_edit ld-2.29.so [.] 
From 680d125cd522d460b24ccc8b29f03cdb62dea23e Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Thu, 19 Mar 2020 13:25:14 -0700
Subject: [PATCH 57/60] perf script: Add option to enable the LBR stitching
 approach

With the LBR stitching approach, the reconstructed LBR call stack can
break the HW limitation. However, it may reconstruct invalid call
stacks in some cases, e.g. exception handling such as setjmp/longjmp.
Also, it may impact the processing time especially when the number of
samples with stitched LBRs is huge.

Add an option to enable the approach.

Committer testing:

Using the same perf.data as with the latest cset committer testing
section:

  $ perf script --stitch-lbr
  tchain_edit 11131 15164.984292:     437491 cycles:u:
            401106 f43+0x0 (/wb/tchain_edit)
            40114c f42+0x18 (/wb/tchain_edit)
            401172 f41+0xe (/wb/tchain_edit)
            401194 f40+0x0 (/wb/tchain_edit)
            40119b f39+0x0 (/wb/tchain_edit)
            4011a2 f38+0x0 (/wb/tchain_edit)
            4011a9 f37+0x0 (/wb/tchain_edit)
            4011b0 f36+0x0 (/wb/tchain_edit)
            4011b7 f35+0x0 (/wb/tchain_edit)
            4011be f34+0x0 (/wb/tchain_edit)
            4011c5 f33+0x0 (/wb/tchain_edit)
            4011cc f32+0x0 (/wb/tchain_edit)
            401207 f31+0x34 (/wb/tchain_edit)
            401212 f30+0x0 (/wb/tchain_edit)
            401219 f29+0x0 (/wb/tchain_edit)
            401220 f28+0x0 (/wb/tchain_edit)
            401227 f27+0x0 (/wb/tchain_edit)
            40122e f26+0x0 (/wb/tchain_edit)
            401235 f25+0x0 (/wb/tchain_edit)
            40123c f24+0x0 (/wb/tchain_edit)
            401243 f23+0x0 (/wb/tchain_edit)
            40124a f22+0x0 (/wb/tchain_edit)
            401251 f21+0x0 (/wb/tchain_edit)
            401258 f20+0x0 (/wb/tchain_edit)
            40125f f19+0x0 (/wb/tchain_edit)
            401266 f18+0x0 (/wb/tchain_edit)
            40126d f17+0x0 (/wb/tchain_edit)
            401274 f16+0x0 (/wb/tchain_edit)
            40127b f15+0x0 (/wb/tchain_edit)
            401282 f14+0x0 (/wb/tchain_edit)
            401289 f13+0x0 (/wb/tchain_edit)
            401290 f12+0x0 (/wb/tchain_edit)
            401297 f11+0x0 (/wb/tchain_edit)
            40129e f10+0x0 (/wb/tchain_edit)
            4012a5 f9+0x0 (/wb/tchain_edit)
            4012ac f8+0x0 (/wb/tchain_edit)
            4012b3 f7+0x0 (/wb/tchain_edit)
            4012ba f6+0x0 (/wb/tchain_edit)
            4012c1 f5+0x0 (/wb/tchain_edit)
            4012c8 f4+0x0 (/wb/tchain_edit)
            4012cf f3+0x0 (/wb/tchain_edit)
            4012d6 f2+0x0 (/wb/tchain_edit)
            4012dd f1+0x0 (/wb/tchain_edit)
            4012e4 main+0x0 (/wb/tchain_edit)
      7f41a5016f41 __libc_start_main+0xf1 (/usr/lib64/libc-2.29.so)
  $

Signed-off-by: Kan Liang
Reviewed-by: Andi Kleen
Acked-by: Jiri Olsa
Tested-by: Arnaldo Carvalho de Melo
Cc: Adrian Hunter
Cc: Alexey Budankov
Cc: Mathieu Poirier
Cc: Michael Ellerman
Cc: Namhyung Kim
Cc: Pavel Gerasimov
Cc: Peter Zijlstra
Cc: Ravi Bangoria
Cc: Stephane Eranian
Cc: Vitaly Slobodskoy
Link: http://lore.kernel.org/lkml/20200319202517.23423-15-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/Documentation/perf-script.txt | 11 +++++++++++
 tools/perf/builtin-script.c              | 12 ++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 963487e82edc..372dfd110e6d 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -440,6 +440,17 @@ include::itrace.txt[]
 --show-on-off-events::
 	Show the --switch-on/off events too.
 
+--stitch-lbr::
+	Show callgraph with stitched LBRs, which may produce a more complete
+	callgraph. The perf.data file must have been obtained using
+	perf record --call-graph lbr.
+	Disabled by default. In common cases with call stack overflows,
+	it can recreate better call stacks than the default lbr call stack
+	output. But this approach is not foolproof. There can be cases
+	where it creates incorrect call stacks from incorrect matches.
+	Known limitations include exception handling such as setjmp/longjmp,
+	where calls and returns will not match.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 06b511c0a539..a2236542900d 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -1697,6 +1697,7 @@ struct perf_script {
 	bool			show_cgroup_events;
 	bool			allocated;
 	bool			per_event_dump;
+	bool			stitch_lbr;
 	struct evswitch		evswitch;
 	struct perf_cpu_map	*cpus;
 	struct perf_thread_map *threads;
@@ -1923,6 +1924,9 @@ static void process_event(struct perf_script *script,
 	if (PRINT_FIELD(IP)) {
 		struct callchain_cursor *cursor = NULL;
 
+		if (script->stitch_lbr)
+			al->thread->lbr_stitch_enable = true;
+
 		if (symbol_conf.use_callchain && sample->callchain &&
 		    thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
 					      sample, NULL, NULL, scripting_max_stack) == 0)
@@ -3170,6 +3174,12 @@ static void script__setup_sample_type(struct perf_script *script)
 		else
 			callchain_param.record_mode = CALLCHAIN_FP;
 	}
+
+	if (script->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
+		pr_warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
+			   "Please apply --call-graph lbr when recording.\n");
+		script->stitch_lbr = false;
+	}
 }
 
 static int process_stat_round_event(struct perf_session *session,
@@ -3481,6 +3491,8 @@ int cmd_script(int argc, const char **argv)
 		   "file", "file saving guest os /proc/kallsyms"),
 	OPT_STRING(0, "guestmodules", &symbol_conf.default_guest_modules, "file",
 		   "file saving guest os /proc/modules"),
+	OPT_BOOLEAN('\0', "stitch-lbr", &script.stitch_lbr,
+		    "Enable LBR callgraph stitching approach"),
 	OPTS_EVSWITCH(&script.evswitch),
 	OPT_END()
 	};
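The report and script hunks above show the whole enabling mechanism: the tool itself never stitches. It only sets lbr_stitch_enable on the thread of each resolved sample, and resolve_lbr_callchain_sample() in util/machine.c keys off that flag when deciding whether to allocate the stitch state (the leading part of that condition is truncated in the stitch patch earlier). The same pair of hunks recurs in the top and c2c patches below. A stripped-down, self-contained sketch of the hand-off (stand-in types, invented names, not perf's actual structures):

  #include <stdbool.h>
  #include <stdio.h>

  /*
   * Minimal stand-ins for perf's types, just to show the split of
   * responsibility: the command marks the thread per sample, and the
   * shared callchain resolver honours the per-thread mark.
   */
  struct thread { bool lbr_stitch_enable; };
  struct tool   { bool stitch_lbr; };

  static void process_sample(struct tool *t, struct thread *th)
  {
          if (t->stitch_lbr)
                  th->lbr_stitch_enable = true; /* same one-liner in all four tools */
  }

  static void resolve_callchain(const struct thread *th)
  {
          printf(th->lbr_stitch_enable ?
                 "stitching enabled for this thread\n" :
                 "plain LBR call stack\n");
  }

  int main(void)
  {
          struct tool tool = { .stitch_lbr = true };
          struct thread th = { 0 };

          process_sample(&tool, &th);
          resolve_callchain(&th);
          return 0;
  }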
From 13e0c844fa097f657bd8204fd574477c34f47a0c Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Thu, 19 Mar 2020 13:25:15 -0700
Subject: [PATCH 58/60] perf top: Add option to enable the LBR stitching
 approach

With the LBR stitching approach, the reconstructed LBR call stack can
break the HW limitation. However, it may reconstruct invalid call
stacks in some cases, e.g. exception handling such as setjmp/longjmp.
Also, it may impact the processing time especially when the number of
samples with stitched LBRs is huge.

Add an option to enable the approach. The option must be used with
--call-graph lbr.

Signed-off-by: Kan Liang
Reviewed-by: Andi Kleen
Acked-by: Jiri Olsa
Tested-by: Arnaldo Carvalho de Melo
Cc: Adrian Hunter
Cc: Alexey Budankov
Cc: Mathieu Poirier
Cc: Michael Ellerman
Cc: Namhyung Kim
Cc: Pavel Gerasimov
Cc: Peter Zijlstra
Cc: Ravi Bangoria
Cc: Stephane Eranian
Cc: Vitaly Slobodskoy
Link: http://lore.kernel.org/lkml/20200319202517.23423-16-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/Documentation/perf-top.txt |  9 +++++++++
 tools/perf/builtin-top.c              | 11 +++++++++++
 tools/perf/util/top.h                 |  1 +
 3 files changed, 21 insertions(+)

diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 487737a725e9..20227dabc208 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -319,6 +319,15 @@ Default is to monitor all CPUS.
 	go straight to the histogram browser, just like 'perf top' with no events
 	explicitely specified does.
 
+--stitch-lbr::
+	Show callgraph with stitched LBRs, which may produce a more complete
+	callgraph. The option must be used with --call-graph lbr recording.
+	Disabled by default. In common cases with call stack overflows,
+	it can recreate better call stacks than the default lbr call stack
+	output. But this approach is not foolproof. There can be cases
+	where it creates incorrect call stacks from incorrect matches.
+	Known limitations include exception handling such as setjmp/longjmp,
+	where calls and returns will not match.
 
 INTERACTIVE PROMPTING KEYS
 --------------------------
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 289cf83e658a..6b067a5ba1d5 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -33,6 +33,7 @@
 #include "util/map.h"
 #include "util/mmap.h"
 #include "util/session.h"
+#include "util/thread.h"
 #include "util/symbol.h"
 #include "util/synthetic-events.h"
 #include "util/top.h"
@@ -775,6 +776,9 @@ static void perf_event__process_sample(struct perf_tool *tool,
 	if (machine__resolve(machine, &al, sample) < 0)
 		return;
 
+	if (top->stitch_lbr)
+		al.thread->lbr_stitch_enable = true;
+
 	if (!machine->kptr_restrict_warned &&
 	    symbol_conf.kptr_restrict &&
 	    al.cpumode == PERF_RECORD_MISC_KERNEL) {
@@ -1571,6 +1575,8 @@ int cmd_top(int argc, const char **argv)
 		    "Sort the output by the event at the index n in group. "
 		    "If n is invalid, sort by the first event. "
 		    "WARNING: should be used on grouped events."),
+	OPT_BOOLEAN(0, "stitch-lbr", &top.stitch_lbr,
+		    "Enable LBR callgraph stitching approach"),
 	OPTS_EVSWITCH(&top.evswitch),
 	OPT_END()
 	};
@@ -1640,6 +1646,11 @@ int cmd_top(int argc, const char **argv)
 		}
 	}
 
+	if (top.stitch_lbr && !(callchain_param.record_mode == CALLCHAIN_LBR)) {
+		pr_err("Error: --stitch-lbr must be used with --call-graph lbr\n");
+		goto out_delete_evlist;
+	}
+
 	if (opts->branch_stack && callchain_param.enabled)
 		symbol_conf.show_branchflag_count = true;
 
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index f117d4f4821e..45dc84ddff37 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -36,6 +36,7 @@ struct perf_top {
 	bool		   use_tui, use_stdio;
 	bool		   vmlinux_warned;
 	bool		   dump_symtab;
+	bool		   stitch_lbr;
 	struct hist_entry  *sym_filter_entry;
 	struct evsel	   *sym_evsel;
 	struct perf_session *session;
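Note the asymmetry in how the option is validated. report and script (and c2c, below) examine an already-recorded perf.data, so on a record-mode mismatch the best they can do is warn and drop the option; top owns the live session, so it can refuse to start and let the user rerun with the right flags. The two shapes, condensed from the hunks for comparison (this is not new logic, just the existing checks side by side):

  /* post-mortem tools (report, script, c2c): degrade gracefully */
  if (tool->stitch_lbr && callchain_param.record_mode != CALLCHAIN_LBR) {
          ui__warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
                      "Please apply --call-graph lbr when recording.\n");
          tool->stitch_lbr = false;
  }

  /* live tool (top): fail fast before any sampling starts */
  if (top.stitch_lbr && callchain_param.record_mode != CALLCHAIN_LBR) {
          pr_err("Error: --stitch-lbr must be used with --call-graph lbr\n");
          goto out_delete_evlist;
  }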
From d80da766d181555d0c846298b8c619c384c7d179 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Thu, 19 Mar 2020 13:25:16 -0700
Subject: [PATCH 59/60] perf c2c: Add option to enable the LBR stitching
 approach

With the LBR stitching approach, the reconstructed LBR call stack can
break the HW limitation. However, it may reconstruct invalid call
stacks in some cases, e.g. exception handling such as setjmp/longjmp.
Also, it may impact the processing time especially when the number of
samples with stitched LBRs is huge.

Add an option to enable the approach.

Signed-off-by: Kan Liang
Reviewed-by: Andi Kleen
Acked-by: Jiri Olsa
Cc: Adrian Hunter
Cc: Alexey Budankov
Cc: Mathieu Poirier
Cc: Michael Ellerman
Cc: Namhyung Kim
Cc: Pavel Gerasimov
Cc: Peter Zijlstra
Cc: Ravi Bangoria
Cc: Stephane Eranian
Cc: Vitaly Slobodskoy
Link: http://lore.kernel.org/lkml/20200319202517.23423-17-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/Documentation/perf-c2c.txt | 11 +++++++++++
 tools/perf/builtin-c2c.c              | 12 ++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt
index e6150f21267d..2133eb320cb0 100644
--- a/tools/perf/Documentation/perf-c2c.txt
+++ b/tools/perf/Documentation/perf-c2c.txt
@@ -111,6 +111,17 @@ REPORT OPTIONS
 --display::
 	Switch to HITM type (rmt, lcl) to display and sort on. Total HITMs as default.
 
+--stitch-lbr::
+	Show callgraph with stitched LBRs, which may produce a more complete
+	callgraph. The perf.data file must have been obtained using
+	perf c2c record --call-graph lbr.
+	Disabled by default. In common cases with call stack overflows,
+	it can recreate better call stacks than the default lbr call stack
+	output. But this approach is not foolproof. There can be cases
+	where it creates incorrect call stacks from incorrect matches.
+	Known limitations include exception handling such as setjmp/longjmp,
+	where calls and returns will not match.
+
 C2C RECORD
 ----------
 The perf c2c record command setup options related to HITM cacheline analysis
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index 246ac0b4d54f..0d544c4fb4be 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -95,6 +95,7 @@ struct perf_c2c {
 	bool			 use_stdio;
 	bool			 stats_only;
 	bool			 symbol_full;
+	bool			 stitch_lbr;
 
 	/* HITM shared clines stats */
 	struct c2c_stats	hitm_stats;
@@ -273,6 +274,9 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
 		return -1;
 	}
 
+	if (c2c.stitch_lbr)
+		al.thread->lbr_stitch_enable = true;
+
 	ret = sample__resolve_callchain(sample, &callchain_cursor, NULL,
 					evsel, &al, sysctl_perf_event_max_stack);
 	if (ret)
@@ -2601,6 +2605,12 @@ static int setup_callchain(struct evlist *evlist)
 		}
 	}
 
+	if (c2c.stitch_lbr && (mode != CALLCHAIN_LBR)) {
+		ui__warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
+			    "Please apply --call-graph lbr when recording.\n");
+		c2c.stitch_lbr = false;
+	}
+
 	callchain_param.record_mode = mode;
 	callchain_param.min_percent = 0;
 	return 0;
@@ -2752,6 +2762,8 @@ static int perf_c2c__report(int argc, const char **argv)
 	OPT_STRING('c', "coalesce", &coalesce, "coalesce fields",
 		   "coalesce fields: pid,tid,iaddr,dso"),
 	OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"),
+	OPT_BOOLEAN(0, "stitch-lbr", &c2c.stitch_lbr,
+		    "Enable LBR callgraph stitching approach"),
 	OPT_PARENT(c2c_options),
 	OPT_END()
 	};
From 12e89e65f446476951f42aedeef56b6bd6f7f1e6 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Thu, 19 Mar 2020 13:25:17 -0700
Subject: [PATCH 60/60] perf hist: Add fast path for duplicate entries check

Perf checks for duplicate entries in a callchain before adding an
entry. However the check is very slow, especially with deep call
stacks. Nearly 50% of the elapsed time of perf report is spent on the
check when the call stack is always 32 levels deep.

hist_entry__cmp() is used to compare the new entry with the old
entries. It goes through all the available sorts in the sort_list and
calls the specific cmp of each sort, which is very slow.

In most cases, there are no duplicate entries in the callchain; the
symbols are usually different. It's much faster to do a quick check of
the symbols first, and only do the full cmp when the symbols are
exactly the same. The quick check only compares symbols, not DSOs.

Export _sort__sym_cmp().
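The change is an instance of a common pattern: guard an expensive, fully general comparison with a cheap test that usually discriminates. A self-contained toy showing the shape (invented types and names; the real fast path compares struct symbol pointers via _sort__sym_cmp() and falls back to hist_entry__cmp()):

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  struct entry {
          const char *sym;        /* cheap key, usually differs */
          char        other[64];  /* stands in for the remaining sort keys */
  };

  /* Expensive path: stands in for hist_entry__cmp() walking sort_list. */
  static int full_cmp(const struct entry *a, const struct entry *b)
  {
          int ret = strcmp(a->sym, b->sym);

          return ret ? ret : memcmp(a->other, b->other, sizeof(a->other));
  }

  /* Cheap pre-check: entries with different symbols cannot be duplicates. */
  static bool fast_sym_diff(const struct entry *a, const struct entry *b)
  {
          return a->sym != b->sym && strcmp(a->sym, b->sym);
  }

  static bool is_dup(const struct entry *cache, int n, const struct entry *e)
  {
          for (int i = 0; i < n; i++) {
                  if (fast_sym_diff(&cache[i], e))
                          continue;       /* skip the slow compare */
                  if (!full_cmp(&cache[i], e))
                          return true;
          }
          return false;
  }

  int main(void)
  {
          struct entry cache[2] = { { "f1", "a" }, { "f2", "b" } };
          struct entry e = { "f3", "c" };

          printf("duplicate: %s\n", is_dup(cache, 2, &e) ? "yes" : "no");
          return 0;
  }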
  $ perf record --call-graph lbr ./tchain_edit_64

Without the patch:

  $ time perf report --stdio

  real	0m21.142s
  user	0m21.110s
  sys	0m0.033s

With the patch:

  $ time perf report --stdio

  real	0m10.977s
  user	0m10.948s
  sys	0m0.027s

Signed-off-by: Kan Liang
Acked-by: Jiri Olsa
Cc: Adrian Hunter
Cc: Alexey Budankov
Cc: Andi Kleen
Cc: Mathieu Poirier
Cc: Michael Ellerman
Cc: Namhyung Kim
Cc: Pavel Gerasimov
Cc: Peter Zijlstra
Cc: Ravi Bangoria
Cc: Stephane Eranian
Cc: Vitaly Slobodskoy
Link: http://lore.kernel.org/lkml/20200319202517.23423-18-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/hist.c | 23 +++++++++++++++++++++++
 tools/perf/util/sort.c |  2 +-
 tools/perf/util/sort.h |  2 ++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 283a69ff6a3d..c2550dbe7dc3 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1070,6 +1070,20 @@ iter_next_cumulative_entry(struct hist_entry_iter *iter,
 	return fill_callchain_info(al, node, iter->hide_unresolved);
 }
 
+static bool
+hist_entry__fast__sym_diff(struct hist_entry *left,
+			   struct hist_entry *right)
+{
+	struct symbol *sym_l = left->ms.sym;
+	struct symbol *sym_r = right->ms.sym;
+
+	if (!sym_l && !sym_r)
+		return left->ip != right->ip;
+
+	return !!_sort__sym_cmp(sym_l, sym_r);
+}
+
+
 static int
 iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 			       struct addr_location *al)
@@ -1096,6 +1110,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 	};
 	int i;
 	struct callchain_cursor cursor;
+	bool fast = hists__has(he_tmp.hists, sym);
 
 	callchain_cursor_snapshot(&cursor, &callchain_cursor);
 
@@ -1106,6 +1121,14 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 	 * It's possible that it has cycles or recursive calls.
 	 */
 	for (i = 0; i < iter->curr; i++) {
+		/*
+		 * For most cases, there are no duplicate entries in callchain.
+		 * The symbols are usually different. Do a quick check for
+		 * symbols first.
+		 */
+		if (fast && hist_entry__fast__sym_diff(he_cache[i], &he_tmp))
+			continue;
+
 		if (hist_entry__cmp(he_cache[i], &he_tmp) == 0) {
 			/* to avoid calling callback function */
 			iter->he = NULL;
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index f14cc728c358..dc15ddc18b7d 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -237,7 +237,7 @@ static int64_t _sort__addr_cmp(u64 left_ip, u64 right_ip)
 	return (int64_t)(right_ip - left_ip);
 }
 
-static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r)
+int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r)
 {
 	if (!sym_l || !sym_r)
 		return cmp_null(sym_l, sym_r);
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index cfa6ac6f7d06..66d39c4cfe2b 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -311,5 +311,7 @@ int64_t
 sort__daddr_cmp(struct hist_entry *left, struct hist_entry *right);
 int64_t
 sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right);
+int64_t
+_sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r);
 char *hist_entry__srcline(struct hist_entry *he);
 #endif /* __PERF_SORT_H */