diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt
index 8914335db84b..c792fa7b59d3 100644
--- a/tools/perf/Documentation/perf-intel-pt.txt
+++ b/tools/perf/Documentation/perf-intel-pt.txt
@@ -164,7 +164,9 @@ actual number of cycles increases even though the cycles reported does not.
 The number of instructions is known, but if IPC is reported, cycles can be too
 low and so IPC is too high.  Note that inaccuracy decreases as the period of
 sampling increases i.e. if the number of cycles is too low by a small amount,
-that becomes less significant if the number of cycles is large.
+that becomes less significant if the number of cycles is large.  It may also be
+useful to use the 'A' option in conjunction with dlfilter-show-cycles.so to
+provide higher granularity cycle information.
 
 Also note that the IPC instruction count may or may not include the current
 instruction.  If the cycle count is associated with an asynchronous branch
@@ -1082,6 +1084,21 @@ The Z option is equivalent to having recorded a trace without TSC
 decoding a trace of a virtual machine.
 
 
+dlfilter-show-cycles.so
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Cycles can be displayed using dlfilter-show-cycles.so in which case the itrace A
+option can be useful to provide higher granularity cycle information:
+
+	perf script --itrace=A --call-trace --dlfilter dlfilter-show-cycles.so
+
+To see a list of dlfilters:
+
+	perf script -v --list-dlfilters
+
+See also linkperf:perf-dlfilters[1]
+
+
 dump option
 ~~~~~~~~~~~
 
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 1d2b73f99172..a3966f290297 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -362,7 +362,7 @@ ifndef NO_JVMTI
 PROGRAMS += $(OUTPUT)$(LIBJVMTI)
 endif
 
-DLFILTERS := dlfilter-test-api-v0.so
+DLFILTERS := dlfilter-test-api-v0.so dlfilter-show-cycles.so
 DLFILTERS := $(patsubst %,$(OUTPUT)dlfilters/%,$(DLFILTERS))
 
 # what 'all' will build and 'install' will install, in perfexecdir
diff --git a/tools/perf/dlfilters/dlfilter-show-cycles.c b/tools/perf/dlfilters/dlfilter-show-cycles.c
new file mode 100644
index 000000000000..9eccc97bff82
--- /dev/null
+++ b/tools/perf/dlfilters/dlfilter-show-cycles.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * dlfilter-show-cycles.c: Print the number of cycles at the start of each line
+ * Copyright (c) 2021, Intel Corporation.
+ */
+#include <perf/perf_dlfilter.h>
+#include <string.h>
+#include <stdio.h>
+
+#define MAX_CPU 4096
+
+enum {
+	INSTR_CYC,
+	BRNCH_CYC,
+	OTHER_CYC,
+	MAX_ENTRY
+};
+
+static __u64 cycles[MAX_CPU][MAX_ENTRY];
+static __u64 cycles_rpt[MAX_CPU][MAX_ENTRY];
+
+#define BITS		16
+#define TABLESZ		(1 << BITS)
+#define TABLEMAX	(TABLESZ / 2)
+#define MASK		(TABLESZ - 1)
+
+static struct entry {
+	__u32 used;
+	__s32 tid;
+	__u64 cycles[MAX_ENTRY];
+	__u64 cycles_rpt[MAX_ENTRY];
+} table[TABLESZ];
+
+static int tid_cnt;
+
+static int event_entry(const char *event)
+{
+	if (!event)
+		return OTHER_CYC;
+	if (!strncmp(event, "instructions", 12))
+		return INSTR_CYC;
+	if (!strncmp(event, "branches", 8))
+		return BRNCH_CYC;
+	return OTHER_CYC;
+}
+
+static struct entry *find_entry(__s32 tid)
+{
+	__u32 pos = tid & MASK;
+	struct entry *e;
+
+	e = &table[pos];
+	while (e->used) {
+		if (e->tid == tid)
+			return e;
+		if (++pos == TABLESZ)
+			pos = 0;
+		e = &table[pos];
+	}
+
+	if (tid_cnt >= TABLEMAX) {
+		fprintf(stderr, "Too many threads\n");
+		return NULL;
+	}
+
+	tid_cnt += 1;
+	e->used = 1;
+	e->tid = tid;
+	return e;
+}
+
+static void add_entry(__s32 tid, int pos, __u64 cnt)
+{
+	struct entry *e = find_entry(tid);
+
+	if (e)
+		e->cycles[pos] += cnt;
+}
+
+int filter_event_early(void *data, const struct perf_dlfilter_sample *sample, void *ctx)
+{
+	__s32 cpu = sample->cpu;
+	__s32 tid = sample->tid;
+	int pos;
+
+	if (!sample->cyc_cnt)
+		return 0;
+
+	pos = event_entry(sample->event);
+
+	if (cpu >= 0 && cpu < MAX_CPU)
+		cycles[cpu][pos] += sample->cyc_cnt;
+	else if (tid != -1)
+		add_entry(tid, pos, sample->cyc_cnt);
+	return 0;
+}
+
+static void print_vals(__u64 cycles, __u64 delta)
+{
+	if (delta)
+		printf("%10llu %10llu ", cycles, delta);
+	else
+		printf("%10llu %10s ", cycles, "");
+}
+
+int filter_event(void *data, const struct perf_dlfilter_sample *sample, void *ctx)
+{
+	__s32 cpu = sample->cpu;
+	__s32 tid = sample->tid;
+	int pos;
+
+	pos = event_entry(sample->event);
+
+	if (cpu >= 0 && cpu < MAX_CPU) {
+		print_vals(cycles[cpu][pos], cycles[cpu][pos] - cycles_rpt[cpu][pos]);
+		cycles_rpt[cpu][pos] = cycles[cpu][pos];
+		return 0;
+	}
+
+	if (tid != -1) {
+		struct entry *e = find_entry(tid);
+
+		if (e) {
+			print_vals(e->cycles[pos], e->cycles[pos] - e->cycles_rpt[pos]);
+			e->cycles_rpt[pos] = e->cycles[pos];
+			return 0;
+		}
+	}
+
+	printf("%22s", "");
+	return 0;
+}
+
+const char *filter_description(const char **long_description)
+{
+	static char *long_desc = "Cycle counts are accumulated per CPU (or "
+		"per thread if CPU is not recorded) from IPC information, and "
+		"printed together with the change since the last print, at the "
+		"start of each line. Separate counts are kept for branches, "
+		"instructions or other events.";
+
+	*long_description = long_desc;
+	return "Print the number of cycles at the start of each line";
+}