From 301406b9c69e4914cf45ae9d5f929e7bcf0d93cd Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 13 Jun 2009 00:11:21 +0200
Subject: [PATCH 01/49] perf annotate: Print the filename:line for annotated
 colored lines

When we have a colored line in perf annotate, i.e. a middle/high
overhead one, it's sometimes useful to get the matching filename and
line from the source file. This is especially so since this patch
prepares for a subsequent one which will print a sorted summary of
middle/high overhead lines at the beginning of the output.

Filename:line pairs are printed in the same color as the ip lines
they belong to.

It can be slow because it relies on addr2line. We could also use
objdump with -l, but that implies we would have to buffer objdump's
output and parse it to filter out the relevant lines, since we want
to print a sorted summary at the beginning.
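
For reference, the lookup boils down to one addr2line invocation per
hot instruction, along these lines (the address and output here are
purely illustrative):

  $ addr2line -e vmlinux ffffffff810edfd4
  /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:1653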

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1244844682-12928-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 98 ++++++++++++++++++++++++++++++++++-
 tools/perf/util/symbol.h      |  1 +
 2 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index b1ed5f766cb3..6a08da41f76b 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -39,6 +39,8 @@ static int		dump_trace = 0;
 
 static int		verbose;
 
+static int		print_line;
+
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
@@ -84,6 +86,12 @@ typedef union event_union {
 	struct period_event		period;
 } event_t;
 
+
+struct sym_ext {
+	double		percent;
+	char		*path;
+};
+
 static LIST_HEAD(dsos);
 static struct dso *kernel_dso;
 static struct dso *vdso;
@@ -1034,6 +1042,8 @@ static int
 parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 {
 	char *line = NULL, *tmp, *tmp2;
+	static const char *prev_line;
+	static const char *prev_color;
 	unsigned int offset;
 	size_t line_len;
 	__u64 line_ip;
@@ -1073,15 +1083,20 @@ parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 	}
 
 	if (line_ip != -1) {
+		const char *path = NULL;
 		unsigned int hits = 0;
 		double percent = 0.0;
 		char *color = PERF_COLOR_NORMAL;
+		struct sym_ext *sym_ext = sym->priv;
 
 		offset = line_ip - start;
 		if (offset < len)
 			hits = sym->hist[offset];
 
-		if (sym->hist_sum)
+		if (sym_ext) {
+			path = sym_ext[offset].path;
+			percent = sym_ext[offset].percent;
+		} else if (sym->hist_sum)
 			percent = 100.0 * hits / sym->hist_sum;
 
 		/*
@@ -1096,6 +1111,20 @@ parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 				color = PERF_COLOR_GREEN;
 		}
 
+		/*
+		 * Also color the filename and line if needed, with
+		 * the same color than the percentage. Don't print it
+		 * twice for close colored ip with the same filename:line
+		 */
+		if (path) {
+			if (!prev_line || strcmp(prev_line, path)
+				       || color != prev_color) {
+				color_fprintf(stdout, color, " %s", path);
+				prev_line = path;
+				prev_color = color;
+			}
+		}
+
 		color_fprintf(stdout, color, " %7.2f", percent);
 		printf(" :	");
 		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", line);
@@ -1109,6 +1138,67 @@ parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 	return 0;
 }
 
+static void free_source_line(struct symbol *sym, int len)
+{
+	struct sym_ext *sym_ext = sym->priv;
+	int i;
+
+	if (!sym_ext)
+		return;
+
+	for (i = 0; i < len; i++)
+		free(sym_ext[i].path);
+	free(sym_ext);
+
+	sym->priv = NULL;
+}
+
+/* Get the filename:line for the colored entries */
+static void get_source_line(struct symbol *sym, __u64 start, int len)
+{
+	int i;
+	char cmd[PATH_MAX * 2];
+	struct sym_ext *sym_ext;
+
+	if (!sym->hist_sum)
+		return;
+
+	sym->priv = calloc(len, sizeof(struct sym_ext));
+	if (!sym->priv)
+		return;
+
+	sym_ext = sym->priv;
+
+	for (i = 0; i < len; i++) {
+		char *path = NULL;
+		size_t line_len;
+		__u64 offset;
+		FILE *fp;
+
+		sym_ext[i].percent = 100.0 * sym->hist[i] / sym->hist_sum;
+		if (sym_ext[i].percent <= 0.5)
+			continue;
+
+		offset = start + i;
+		sprintf(cmd, "addr2line -e %s %016llx", vmlinux, offset);
+		fp = popen(cmd, "r");
+		if (!fp)
+			continue;
+
+		if (getline(&path, &line_len, fp) < 0 || !line_len)
+			goto next;
+
+		sym_ext[i].path = malloc(sizeof(char) * line_len);
+		if (!sym_ext[i].path)
+			goto next;
+
+		strcpy(sym_ext[i].path, path);
+
+	next:
+		pclose(fp);
+	}
+}
+
 static void annotate_sym(struct dso *dso, struct symbol *sym)
 {
 	char *filename = dso->name;
@@ -1135,6 +1225,9 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
 	end = start + sym->end - sym->start + 1;
 	len = sym->end - sym->start;
 
+	if (print_line)
+		get_source_line(sym, start, len);
+
 	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
 
 	if (verbose >= 3)
@@ -1150,6 +1243,7 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
 	}
 
 	pclose(file);
+	free_source_line(sym, len);
 }
 
 static void find_annotations(void)
@@ -1308,6 +1402,8 @@ static const struct option options[] = {
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+	OPT_BOOLEAN('l', "print-line", &print_line,
+		    "print matching source lines (may be slow)"),
 	OPT_END()
 };
 
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 0d1292bd8270..5ad9b06c3f6f 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -12,6 +12,7 @@ struct symbol {
 	__u64		obj_start;
 	__u64		hist_sum;
 	__u64		*hist;
+	void		*priv;
 	char		name[0];
 };
 

From 971738f3669092dd247eaf89658f2685180492a0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 13 Jun 2009 00:11:22 +0200
Subject: [PATCH 02/49] perf annotate: Print a sorted summary of annotated
 overhead lines

It can be very annoying to scroll down through perf annotate output
until we find the relevant overhead.

Using the -l option, you can now get a small summary sorted by
overhead at the beginning of the output.

Example:

./perf annotate -l -k ../../vmlinux -s __lock_acquire

Sorted summary for file ../../vmlinux
----------------------------------------------

   12.04 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:1653
    4.61 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:1740
    3.77 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:1775
    3.56 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:1653
    2.93 /home/fweisbec/linux/linux-2.6-tip/arch/x86/include/asm/irqflags.h:15
    2.83 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:2545
    2.30 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:2594
    2.20 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:2388
    2.20 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:730
    2.09 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:730
    2.09 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:138
    1.88 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:2548
    1.47 /home/fweisbec/linux/linux-2.6-tip/arch/x86/include/asm/irqflags.h:15
    1.36 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:2594
    1.36 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:730
    1.26 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:1654
    1.26 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:1653
    1.15 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:2592
    1.15 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:1740
    1.15 /home/fweisbec/linux/linux-2.6-tip/kernel/lockdep.c:1740

[...]

Only overheads above 0.5% are summarized.
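
The descending order falls out of the rb-tree insertion comparing on
the overhead percentage, as in this sketch mirroring
insert_source_line() in the diff below:

	if (sym_ext->percent > iter->percent)
		p = &(*p)->rb_left;	/* higher overhead sorts towards rb_first() */
	else
		p = &(*p)->rb_right;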

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1244844682-12928-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 111 +++++++++++++++++++++++++++-------
 1 file changed, 90 insertions(+), 21 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 6a08da41f76b..7a5b27867a96 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -25,6 +25,10 @@
 #define SHOW_USER	2
 #define SHOW_HV		4
 
+#define MIN_GREEN		0.5
+#define MIN_RED		5.0
+
+
 static char		const *input_name = "perf.data";
 static char		*vmlinux = "vmlinux";
 
@@ -88,6 +92,7 @@ typedef union event_union {
 
 
 struct sym_ext {
+	struct rb_node	node;
 	double		percent;
 	char		*path;
 };
@@ -1038,6 +1043,24 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static char *get_color(double percent)
+{
+	char *color = PERF_COLOR_NORMAL;
+
+	/*
+	 * We color high-overhead entries in red, mid-overhead
+	 * entries in green - and keep the low overhead places
+	 * normal:
+	 */
+	if (percent >= MIN_RED)
+		color = PERF_COLOR_RED;
+	else {
+		if (percent > MIN_GREEN)
+			color = PERF_COLOR_GREEN;
+	}
+	return color;
+}
+
 static int
 parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 {
@@ -1086,7 +1109,7 @@ parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 		const char *path = NULL;
 		unsigned int hits = 0;
 		double percent = 0.0;
-		char *color = PERF_COLOR_NORMAL;
+		char *color;
 		struct sym_ext *sym_ext = sym->priv;
 
 		offset = line_ip - start;
@@ -1099,17 +1122,7 @@ parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 		} else if (sym->hist_sum)
 			percent = 100.0 * hits / sym->hist_sum;
 
-		/*
-		 * We color high-overhead entries in red, mid-overhead
-		 * entries in green - and keep the low overhead places
-		 * normal:
-		 */
-		if (percent >= 5.0)
-			color = PERF_COLOR_RED;
-		else {
-			if (percent > 0.5)
-				color = PERF_COLOR_GREEN;
-		}
+		color = get_color(percent);
 
 		/*
 		 * Also color the filename and line if needed, with
@@ -1138,6 +1151,28 @@ parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 	return 0;
 }
 
+static struct rb_root root_sym_ext;
+
+static void insert_source_line(struct sym_ext *sym_ext)
+{
+	struct sym_ext *iter;
+	struct rb_node **p = &root_sym_ext.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct sym_ext, node);
+
+		if (sym_ext->percent > iter->percent)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&sym_ext->node, parent, p);
+	rb_insert_color(&sym_ext->node, &root_sym_ext);
+}
+
 static void free_source_line(struct symbol *sym, int len)
 {
 	struct sym_ext *sym_ext = sym->priv;
@@ -1151,6 +1186,7 @@ static void free_source_line(struct symbol *sym, int len)
 	free(sym_ext);
 
 	sym->priv = NULL;
+	root_sym_ext = RB_ROOT;
 }
 
 /* Get the filename:line for the colored entries */
@@ -1193,12 +1229,42 @@ static void get_source_line(struct symbol *sym, __u64 start, int len)
 			goto next;
 
 		strcpy(sym_ext[i].path, path);
+		insert_source_line(&sym_ext[i]);
 
 	next:
 		pclose(fp);
 	}
 }
 
+static void print_summary(char *filename)
+{
+	struct sym_ext *sym_ext;
+	struct rb_node *node;
+
+	printf("\nSorted summary for file %s\n", filename);
+	printf("----------------------------------------------\n\n");
+
+	if (RB_EMPTY_ROOT(&root_sym_ext)) {
+		printf(" Nothing higher than %1.1f%%\n", MIN_GREEN);
+		return;
+	}
+
+	node = rb_first(&root_sym_ext);
+	while (node) {
+		double percent;
+		char *color;
+		char *path;
+
+		sym_ext = rb_entry(node, struct sym_ext, node);
+		percent = sym_ext->percent;
+		color = get_color(percent);
+		path = sym_ext->path;
+
+		color_fprintf(stdout, color, " %7.2f %s", percent, path);
+		node = rb_next(node);
+	}
+}
+
 static void annotate_sym(struct dso *dso, struct symbol *sym)
 {
 	char *filename = dso->name;
@@ -1211,13 +1277,6 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
 	if (dso == kernel_dso)
 		filename = vmlinux;
 
-	printf("\n------------------------------------------------\n");
-	printf(" Percent |	Source code & Disassembly of %s\n", filename);
-	printf("------------------------------------------------\n");
-
-	if (verbose >= 2)
-		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
-
 	start = sym->obj_start;
 	if (!start)
 		start = sym->start;
@@ -1225,8 +1284,17 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
 	end = start + sym->end - sym->start + 1;
 	len = sym->end - sym->start;
 
-	if (print_line)
+	if (print_line) {
 		get_source_line(sym, start, len);
+		print_summary(filename);
+	}
+
+	printf("\n\n------------------------------------------------\n");
+	printf(" Percent |	Source code & Disassembly of %s\n", filename);
+	printf("------------------------------------------------\n");
+
+	if (verbose >= 2)
+		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
 
 	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
 
@@ -1243,7 +1311,8 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
 	}
 
 	pclose(file);
-	free_source_line(sym, len);
+	if (print_line)
+		free_source_line(sym, len);
 }
 
 static void find_annotations(void)

From 87847b8f26cc7176ec9b239898dc7ce47a94e1a6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 13 Jun 2009 17:06:50 +1000
Subject: [PATCH 03/49] perf_counter: Fix atomic_set vs. atomic64_t type
 mismatch

Using atomic_set on an atomic64_t variable gives a compiler
warning on powerpc, and won't give the desired result at runtime.
This fixes an instance of this error in the perf_counter code.
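
A minimal sketch of the mismatch (illustrative, not lifted from the
kernel sources): atomic_set() takes an atomic_t *, so passing an
atomic64_t * draws an incompatible-pointer warning, and on 64-bit
big-endian powerpc the resulting 32-bit store would only touch one
half of the value anyway:

	atomic64_t period_left = ATOMIC64_INIT(-1);

	atomic_set(&period_left, 0);	/* wrong: 32-bit store, type mismatch */
	atomic64_set(&period_left, 0);	/* correct: full 64-bit store */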

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18995.20490.979429.244883@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b685f551aa..8d14a733f222 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1283,7 +1283,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
 		if (!interrupts) {
 			perf_disable();
 			counter->pmu->disable(counter);
-			atomic_set(&hwc->period_left, 0);
+			atomic64_set(&hwc->period_left, 0);
 			counter->pmu->enable(counter);
 			perf_enable();
 		}

From d5e8da6449d4ef4bac35ea9b9719a2cda02e7b39 Mon Sep 17 00:00:00 2001
From: Marti Raudsepp <marti@juffo.org>
Date: Sat, 13 Jun 2009 02:35:01 +0300
Subject: [PATCH 04/49] perf_counter: Fix stack corruption in perf_read_hw

With PERF_FORMAT_ID, perf_read_hw now needs space for up to 4 values.
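
With scaling enabled, read() returns the counter value plus the
enabled/running times, and PERF_FORMAT_ID appends the counter id, for
up to four u64 words in total. A sketch of the layout:

	struct read_format {
		__u64 value;		/* counter value */
		__u64 time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
		__u64 time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
		__u64 id;		/* PERF_FORMAT_ID */
	};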

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8d14a733f222..e914daff03b5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1553,7 +1553,7 @@ static int perf_release(struct inode *inode, struct file *file)
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 values[3];
+	u64 values[4];
 	int n;
 
 	/*

From 4d2be1267fcfb3a4d2198fd696aec5e3dcbce60e Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Thu, 11 Jun 2009 15:28:09 +0530
Subject: [PATCH 05/49] perf_counter, x86: Check old-AMD performance monitoring
 support

AMD has supported performance monitoring starting from the K7
(i.e. family 6), so disable it for earlier AMD CPUs.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1244714289.6923.0.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 275bc142cd5d..3c37c3930ca1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1459,6 +1459,10 @@ static int intel_pmu_init(void)
 
 static int amd_pmu_init(void)
 {
+	/* Performance-monitoring supported from K7 and later: */
+	if (boot_cpu_data.x86 < 6)
+		return -ENODEV;
+
 	x86_pmu = amd_pmu;
 
 	switch (boot_cpu_data.x86) {

From f4db43a38f7387c3b19c9565124c06ab0c5d6e9a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sat, 13 Jun 2009 01:06:21 +0530
Subject: [PATCH 06/49] perf_counter, x86: Update AMD hw caching related event
 table

All AMD models share the same hw caching related event table.

Also complete the table with more events.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1244835381.2802.2.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 36 +++++++++++++-----------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3c37c3930ca1..77a59a5566a8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -389,23 +389,23 @@ static u64 intel_pmu_raw_event(u64 event)
 	return event & CORE_EVNTSEL_MASK;
 }
 
-static const u64 amd_0f_hw_cache_event_ids
+static const u64 amd_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
 {
  [ C(L1D) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+		[ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
+		[ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
 	},
  },
  [ C(L1I ) ] = {
@@ -418,17 +418,17 @@ static const u64 amd_0f_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+		[ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_PREFETCH) ] = {
@@ -438,8 +438,8 @@ static const u64 amd_0f_hw_cache_event_ids
  },
  [ C(DTLB) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+		[ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DLTB Miss   */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = 0,
@@ -1465,16 +1465,10 @@ static int amd_pmu_init(void)
 
 	x86_pmu = amd_pmu;
 
-	switch (boot_cpu_data.x86) {
-	case 0x0f:
-	case 0x10:
-	case 0x11:
-		memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
-		       sizeof(hw_cache_event_ids));
+	/* Events are common for all AMDs */
+	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+	       sizeof(hw_cache_event_ids));
 
-		pr_cont("AMD Family 0f/10/11 events, ");
-		break;
-	}
 	return 0;
 }
 

From 44175b6f397a6724121eeaf0f072e2c912a46614 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 13 Jun 2009 13:35:00 +0200
Subject: [PATCH 07/49] perf stat: Reorganize output

 - use 'IPC' for the instruction normalization output
 - use 'CPUs' for the CPU utilization factor value
 - print out the elapsed time like the other rows
 - tidy up the task-clocks/cpu-clocks printout

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-stat.c      | 67 ++++++++++++++++++++--------------
 tools/perf/util/parse-events.c |  4 +-
 2 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c43e4a97dc42..c12804853eab 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -184,6 +184,40 @@ static void read_counter(int counter)
 		runtime_cycles = count[0];
 }
 
+static void nsec_printout(int counter, __u64 *count)
+{
+	double msecs = (double)count[0] / 1000000;
+
+	fprintf(stderr, " %14.6f  %-20s", msecs, event_name(counter));
+
+	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+		attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
+
+		if (walltime_nsecs)
+			fprintf(stderr, " # %10.3f CPUs",
+				(double)count[0] / (double)walltime_nsecs);
+	}
+}
+
+static void abs_printout(int counter, __u64 *count)
+{
+	fprintf(stderr, " %14Ld  %-20s", count[0], event_name(counter));
+
+	if (runtime_cycles &&
+		attrs[counter].type == PERF_TYPE_HARDWARE &&
+			attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
+
+		fprintf(stderr, " # %10.3f IPC",
+			(double)count[0] / (double)runtime_cycles);
+
+		return;
+	}
+
+	if (runtime_nsecs)
+		fprintf(stderr, " # %10.3f M/sec",
+			(double)count[0]/runtime_nsecs*1000.0);
+}
+
 /*
  * Print out the results of a single counter:
  */
@@ -201,35 +235,15 @@ static void print_counter(int counter)
 		return;
 	}
 
-	if (nsec_counter(counter)) {
-		double msecs = (double)count[0] / 1000000;
+	if (nsec_counter(counter))
+		nsec_printout(counter, count);
+	else
+		abs_printout(counter, count);
 
-		fprintf(stderr, " %14.6f  %-20s",
-			msecs, event_name(counter));
-		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
-			attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
-
-			if (walltime_nsecs)
-				fprintf(stderr, " # %11.3f CPU utilization factor",
-					(double)count[0] / (double)walltime_nsecs);
-		}
-	} else {
-		fprintf(stderr, " %14Ld  %-20s",
-			count[0], event_name(counter));
-		if (runtime_nsecs)
-			fprintf(stderr, " # %11.3f M/sec",
-				(double)count[0]/runtime_nsecs*1000.0);
-		if (runtime_cycles &&
-			attrs[counter].type == PERF_TYPE_HARDWARE &&
-				attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
-
-			fprintf(stderr, " # %1.3f per cycle",
-				(double)count[0] / (double)runtime_cycles);
-		}
-	}
 	if (scaled)
 		fprintf(stderr, "  (scaled from %.2f%%)",
 			(double) count[2] / count[1] * 100);
+
 	fprintf(stderr, "\n");
 }
 
@@ -290,8 +304,7 @@ static int do_perf_stat(int argc, const char **argv)
 
 
 	fprintf(stderr, "\n");
-	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
-			(double)(t1-t0)/1e6);
+	fprintf(stderr, " %14.9f  seconds time elapsed.\n", (double)(t1-t0)/1e9);
 	fprintf(stderr, "\n");
 
 	return 0;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 5a72586e1df0..f0c9f2627fe1 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -63,8 +63,8 @@ static char *hw_event_names[] = {
 };
 
 static char *sw_event_names[] = {
-	"cpu-clock-ticks",
-	"task-clock-ticks",
+	"cpu-clock-msecs",
+	"task-clock-msecs",
 	"page-faults",
 	"context-switches",
 	"CPU-migrations",

From 42202dd56c717f173cd0bf2390249e1bf5cf210b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 13 Jun 2009 14:57:28 +0200
Subject: [PATCH 08/49] perf stat: Add feature to run and measure a command
 multiple times

Add the --repeat <n> feature to perf stat, which repeats a given
command up to 100 times, collects the stats and calculates an
average and a stddev.

For example, the following 'perf stat' one-liner runs hackbench
5 times and prints a tabulated result of all metrics, with averages
and noise levels (in percent) included:

 aldebaran:~/linux/linux/tools/perf> ./perf stat --repeat 5 ~/hackbench 10
 Time: 0.117
 Time: 0.108
 Time: 0.089
 Time: 0.088
 Time: 0.100

 Performance counter stats for '/home/mingo/hackbench 10' (5 runs):

    1243.989586  task-clock-msecs     #     10.460 CPUs    ( +-   4.720% )
          47706  context-switches     #      0.038 M/sec   ( +-  19.706% )
            387  CPU-migrations       #      0.000 M/sec   ( +-   3.608% )
          17793  page-faults          #      0.014 M/sec   ( +-   0.354% )
     3770941606  cycles               #   3031.329 M/sec   ( +-   4.621% )
     1566372416  instructions         #      0.415 IPC     ( +-   2.703% )
       16783421  cache-references     #     13.492 M/sec   ( +-   5.202% )
        7128590  cache-misses         #      5.730 M/sec   ( +-   7.420% )

    0.118924455  seconds time elapsed.

The goal of this feature is to make these statistics reliable, and
to show how many times a command has to be repeated for the noise
to go down to an acceptable level.

(The -v option can be used to see a line printed out as each run progresses.)
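
The printed noise percentage is derived from the per-run spread. A
condensed sketch of the computation (the real code keeps separate
per-counter sums; see calc_avg() and print_noise() in the diff):

	/* accumulate the absolute deviation from the average, per run: */
	for (i = 0; i < run_count; i++)
		noise += abs((__s64)(val[i] - avg));

	/* normalize it down towards a stddev-like figure: */
	noise /= run_count * sqrt((double)run_count);

	/* print it relative to the average: */
	fprintf(stderr, " ( +- %7.3f%% )", (double)noise/(avg + 1)*100.0);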

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-stat.c | 305 +++++++++++++++++++++++++++-----------
 1 file changed, 217 insertions(+), 88 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c12804853eab..9eb42b1ae784 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -43,6 +43,7 @@
 #include "util/parse-events.h"
 
 #include <sys/prctl.h>
+#include <math.h>
 
 static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
 
@@ -79,12 +80,34 @@ static const unsigned int default_count[] = {
 	  10000,
 };
 
-static __u64			event_res[MAX_COUNTERS][3];
-static __u64			event_scaled[MAX_COUNTERS];
+#define MAX_RUN 100
 
-static __u64			runtime_nsecs;
-static __u64			walltime_nsecs;
-static __u64			runtime_cycles;
+static int			run_count		=  1;
+static int			run_idx			=  0;
+
+static __u64			event_res[MAX_RUN][MAX_COUNTERS][3];
+static __u64			event_scaled[MAX_RUN][MAX_COUNTERS];
+
+//static __u64			event_hist[MAX_RUN][MAX_COUNTERS][3];
+
+
+static __u64			runtime_nsecs[MAX_RUN];
+static __u64			walltime_nsecs[MAX_RUN];
+static __u64			runtime_cycles[MAX_RUN];
+
+static __u64			event_res_avg[MAX_COUNTERS][3];
+static __u64			event_res_noise[MAX_COUNTERS][3];
+
+static __u64			event_scaled_avg[MAX_COUNTERS];
+
+static __u64			runtime_nsecs_avg;
+static __u64			runtime_nsecs_noise;
+
+static __u64			walltime_nsecs_avg;
+static __u64			walltime_nsecs_noise;
+
+static __u64			runtime_cycles_avg;
+static __u64			runtime_cycles_noise;
 
 static void create_perf_stat_counter(int counter)
 {
@@ -140,7 +163,7 @@ static void read_counter(int counter)
 	int cpu, nv;
 	int scaled;
 
-	count = event_res[counter];
+	count = event_res[run_idx][counter];
 
 	count[0] = count[1] = count[2] = 0;
 
@@ -151,6 +174,8 @@ static void read_counter(int counter)
 
 		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
 		assert(res == nv * sizeof(__u64));
+		close(fd[cpu][counter]);
+		fd[cpu][counter] = -1;
 
 		count[0] += single_count[0];
 		if (scale) {
@@ -162,13 +187,13 @@ static void read_counter(int counter)
 	scaled = 0;
 	if (scale) {
 		if (count[2] == 0) {
-			event_scaled[counter] = -1;
+			event_scaled[run_idx][counter] = -1;
 			count[0] = 0;
 			return;
 		}
 
 		if (count[2] < count[1]) {
-			event_scaled[counter] = 1;
+			event_scaled[run_idx][counter] = 1;
 			count[0] = (unsigned long long)
 				((double)count[0] * count[1] / count[2] + 0.5);
 		}
@@ -178,82 +203,18 @@ static void read_counter(int counter)
 	 */
 	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
 		attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
-		runtime_nsecs = count[0];
+		runtime_nsecs[run_idx] = count[0];
 	if (attrs[counter].type == PERF_TYPE_HARDWARE &&
 		attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
-		runtime_cycles = count[0];
+		runtime_cycles[run_idx] = count[0];
 }
 
-static void nsec_printout(int counter, __u64 *count)
-{
-	double msecs = (double)count[0] / 1000000;
-
-	fprintf(stderr, " %14.6f  %-20s", msecs, event_name(counter));
-
-	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
-		attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
-
-		if (walltime_nsecs)
-			fprintf(stderr, " # %10.3f CPUs",
-				(double)count[0] / (double)walltime_nsecs);
-	}
-}
-
-static void abs_printout(int counter, __u64 *count)
-{
-	fprintf(stderr, " %14Ld  %-20s", count[0], event_name(counter));
-
-	if (runtime_cycles &&
-		attrs[counter].type == PERF_TYPE_HARDWARE &&
-			attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
-
-		fprintf(stderr, " # %10.3f IPC",
-			(double)count[0] / (double)runtime_cycles);
-
-		return;
-	}
-
-	if (runtime_nsecs)
-		fprintf(stderr, " # %10.3f M/sec",
-			(double)count[0]/runtime_nsecs*1000.0);
-}
-
-/*
- * Print out the results of a single counter:
- */
-static void print_counter(int counter)
-{
-	__u64 *count;
-	int scaled;
-
-	count = event_res[counter];
-	scaled = event_scaled[counter];
-
-	if (scaled == -1) {
-		fprintf(stderr, " %14s  %-20s\n",
-			"<not counted>", event_name(counter));
-		return;
-	}
-
-	if (nsec_counter(counter))
-		nsec_printout(counter, count);
-	else
-		abs_printout(counter, count);
-
-	if (scaled)
-		fprintf(stderr, "  (scaled from %.2f%%)",
-			(double) count[2] / count[1] * 100);
-
-	fprintf(stderr, "\n");
-}
-
-static int do_perf_stat(int argc, const char **argv)
+static int run_perf_stat(int argc, const char **argv)
 {
 	unsigned long long t0, t1;
+	int status = 0;
 	int counter;
-	int status;
 	int pid;
-	int i;
 
 	if (!system_wide)
 		nr_cpus = 1;
@@ -277,13 +238,168 @@ static int do_perf_stat(int argc, const char **argv)
 		}
 	}
 
-	while (wait(&status) >= 0)
-		;
+	wait(&status);
 
 	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
 	t1 = rdclock();
 
-	walltime_nsecs = t1 - t0;
+	walltime_nsecs[run_idx] = t1 - t0;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		read_counter(counter);
+
+	return WEXITSTATUS(status);
+}
+
+static void print_noise(__u64 *count, __u64 *noise)
+{
+	if (run_count > 1)
+		fprintf(stderr, "   ( +- %7.3f%% )",
+			(double)noise[0]/(count[0]+1)*100.0);
+}
+
+static void nsec_printout(int counter, __u64 *count, __u64 *noise)
+{
+	double msecs = (double)count[0] / 1000000;
+
+	fprintf(stderr, " %14.6f  %-20s", msecs, event_name(counter));
+
+	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+		attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
+
+		if (walltime_nsecs_avg)
+			fprintf(stderr, " # %10.3f CPUs ",
+				(double)count[0] / (double)walltime_nsecs_avg);
+	}
+	print_noise(count, noise);
+}
+
+static void abs_printout(int counter, __u64 *count, __u64 *noise)
+{
+	fprintf(stderr, " %14Ld  %-20s", count[0], event_name(counter));
+
+	if (runtime_cycles_avg &&
+		attrs[counter].type == PERF_TYPE_HARDWARE &&
+			attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
+
+		fprintf(stderr, " # %10.3f IPC  ",
+			(double)count[0] / (double)runtime_cycles_avg);
+	} else {
+		if (runtime_nsecs_avg) {
+			fprintf(stderr, " # %10.3f M/sec",
+				(double)count[0]/runtime_nsecs_avg*1000.0);
+		}
+	}
+	print_noise(count, noise);
+}
+
+/*
+ * Print out the results of a single counter:
+ */
+static void print_counter(int counter)
+{
+	__u64 *count, *noise;
+	int scaled;
+
+	count = event_res_avg[counter];
+	noise = event_res_noise[counter];
+	scaled = event_scaled_avg[counter];
+
+	if (scaled == -1) {
+		fprintf(stderr, " %14s  %-20s\n",
+			"<not counted>", event_name(counter));
+		return;
+	}
+
+	if (nsec_counter(counter))
+		nsec_printout(counter, count, noise);
+	else
+		abs_printout(counter, count, noise);
+
+	if (scaled)
+		fprintf(stderr, "  (scaled from %.2f%%)",
+			(double) count[2] / count[1] * 100);
+
+	fprintf(stderr, "\n");
+}
+
+/*
+ * Normalize noise values down to stddev:
+ */
+static void normalize(__u64 *val)
+{
+	double res;
+
+	res = (double)*val / (run_count * sqrt((double)run_count));
+
+	*val = (__u64)res;
+}
+
+/*
+ * Calculate the averages and noises:
+ */
+static void calc_avg(void)
+{
+	int i, j;
+
+	for (i = 0; i < run_count; i++) {
+		runtime_nsecs_avg += runtime_nsecs[i];
+		walltime_nsecs_avg += walltime_nsecs[i];
+		runtime_cycles_avg += runtime_cycles[i];
+
+		for (j = 0; j < nr_counters; j++) {
+			event_res_avg[j][0] += event_res[i][j][0];
+			event_res_avg[j][1] += event_res[i][j][1];
+			event_res_avg[j][2] += event_res[i][j][2];
+			event_scaled_avg[j] += event_scaled[i][j];
+		}
+	}
+	runtime_nsecs_avg /= run_count;
+	walltime_nsecs_avg /= run_count;
+	runtime_cycles_avg /= run_count;
+
+	for (j = 0; j < nr_counters; j++) {
+		event_res_avg[j][0] /= run_count;
+		event_res_avg[j][1] /= run_count;
+		event_res_avg[j][2] /= run_count;
+	}
+
+	for (i = 0; i < run_count; i++) {
+		runtime_nsecs_noise +=
+			abs((__s64)(runtime_nsecs[i] - runtime_nsecs_avg));
+		walltime_nsecs_noise +=
+			abs((__s64)(walltime_nsecs[i] - walltime_nsecs_avg));
+		runtime_cycles_noise +=
+			abs((__s64)(runtime_cycles[i] - runtime_cycles_avg));
+
+		for (j = 0; j < nr_counters; j++) {
+			event_res_noise[j][0] +=
+				abs((__s64)(event_res[i][j][0] - event_res_avg[j][0]));
+			event_res_noise[j][1] +=
+				abs((__s64)(event_res[i][j][1] - event_res_avg[j][1]));
+			event_res_noise[j][2] +=
+				abs((__s64)(event_res[i][j][2] - event_res_avg[j][2]));
+		}
+	}
+
+	normalize(&runtime_nsecs_noise);
+	normalize(&walltime_nsecs_noise);
+	normalize(&runtime_cycles_noise);
+
+	for (j = 0; j < nr_counters; j++) {
+		normalize(&event_res_noise[j][0]);
+		normalize(&event_res_noise[j][1]);
+		normalize(&event_res_noise[j][2]);
+	}
+}
+
+static void print_stat(int argc, const char **argv)
+{
+	int i, counter;
+
+	calc_avg();
+
+	run_idx = 0;
 
 	fflush(stdout);
 
@@ -293,21 +409,19 @@ static int do_perf_stat(int argc, const char **argv)
 	for (i = 1; i < argc; i++)
 		fprintf(stderr, " %s", argv[i]);
 
-	fprintf(stderr, "\':\n");
-	fprintf(stderr, "\n");
-
-	for (counter = 0; counter < nr_counters; counter++)
-		read_counter(counter);
+	fprintf(stderr, "\'");
+	if (run_count > 1)
+		fprintf(stderr, " (%d runs)", run_count);
+	fprintf(stderr, ":\n\n");
 
 	for (counter = 0; counter < nr_counters; counter++)
 		print_counter(counter);
 
 
 	fprintf(stderr, "\n");
-	fprintf(stderr, " %14.9f  seconds time elapsed.\n", (double)(t1-t0)/1e9);
+	fprintf(stderr, " %14.9f  seconds time elapsed.\n",
+			(double)walltime_nsecs_avg/1e9);
 	fprintf(stderr, "\n");
-
-	return 0;
 }
 
 static volatile int signr = -1;
@@ -345,11 +459,15 @@ static const struct option options[] = {
 			    "scale/normalize counters"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
+	OPT_INTEGER('r', "repeat", &run_count,
+		    "repeat command and print average + stddev (max: 100)"),
 	OPT_END()
 };
 
 int cmd_stat(int argc, const char **argv, const char *prefix)
 {
+	int status;
+
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	memcpy(attrs, default_attrs, sizeof(attrs));
@@ -357,6 +475,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
 	argc = parse_options(argc, argv, options, stat_usage, 0);
 	if (!argc)
 		usage_with_options(stat_usage, options);
+	if (run_count <= 0 || run_count > MAX_RUN)
+		usage_with_options(stat_usage, options);
 
 	if (!nr_counters)
 		nr_counters = 8;
@@ -376,5 +496,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
 	signal(SIGALRM, skip_signal);
 	signal(SIGABRT, skip_signal);
 
-	return do_perf_stat(argc, argv);
+	status = 0;
+	for (run_idx = 0; run_idx < run_count; run_idx++) {
+		if (run_count != 1 && verbose)
+			fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx+1);
+		status = run_perf_stat(argc, argv);
+	}
+
+	print_stat(argc, argv);
+
+	return status;
 }

From ef281a196d66b8bc2d067a3704712e5b93691fbc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 13 Jun 2009 15:40:35 +0200
Subject: [PATCH 09/49] perf stat: Enable raw data to be printed

If -vv (very verbose) is specified, print out raw data
in the following format:

$ perf stat -vv -r 3 ./loop_1b_instructions

[ perf stat: executing run #1 ... ]
[ perf stat: executing run #2 ... ]
[ perf stat: executing run #3 ... ]

debug:              runtime[0]: 235871872
debug:             walltime[0]: 236646752
debug:       runtime_cycles[0]: 755150182
debug:            counter/0[0]: 235871872
debug:            counter/1[0]: 235871872
debug:            counter/2[0]: 235871872
debug:               scaled[0]: 0
debug:            counter/0[1]: 2
debug:            counter/1[1]: 235870662
debug:            counter/2[1]: 235870662
debug:               scaled[1]: 0
debug:            counter/0[2]: 1
debug:            counter/1[2]: 235870437
debug:            counter/2[2]: 235870437
debug:               scaled[2]: 0
debug:            counter/0[3]: 140
debug:            counter/1[3]: 235870298
debug:            counter/2[3]: 235870298
debug:               scaled[3]: 0
debug:            counter/0[4]: 755150182
debug:            counter/1[4]: 235870145
debug:            counter/2[4]: 235870145
debug:               scaled[4]: 0
debug:            counter/0[5]: 1001411258
debug:            counter/1[5]: 235868838
debug:            counter/2[5]: 235868838
debug:               scaled[5]: 0
debug:            counter/0[6]: 27897
debug:            counter/1[6]: 235868560
debug:            counter/2[6]: 235868560
debug:               scaled[6]: 0
debug:            counter/0[7]: 2910
debug:            counter/1[7]: 235868151
debug:            counter/2[7]: 235868151
debug:               scaled[7]: 0
debug:              runtime[0]: 235980257
debug:             walltime[0]: 236770942
debug:       runtime_cycles[0]: 755114546
debug:            counter/0[0]: 235980257
debug:            counter/1[0]: 235980257
debug:            counter/2[0]: 235980257
debug:               scaled[0]: 0
debug:            counter/0[1]: 3
debug:            counter/1[1]: 235980049
debug:            counter/2[1]: 235980049
debug:               scaled[1]: 0
debug:            counter/0[2]: 1
debug:            counter/1[2]: 235979907
debug:            counter/2[2]: 235979907
debug:               scaled[2]: 0
debug:            counter/0[3]: 135
debug:            counter/1[3]: 235979780
debug:            counter/2[3]: 235979780
debug:               scaled[3]: 0
debug:            counter/0[4]: 755114546
debug:            counter/1[4]: 235979652
debug:            counter/2[4]: 235979652
debug:               scaled[4]: 0
debug:            counter/0[5]: 1001439771
debug:            counter/1[5]: 235979304
debug:            counter/2[5]: 235979304
debug:               scaled[5]: 0
debug:            counter/0[6]: 23723
debug:            counter/1[6]: 235979050
debug:            counter/2[6]: 235979050
debug:               scaled[6]: 0
debug:            counter/0[7]: 2213
debug:            counter/1[7]: 235978820
debug:            counter/2[7]: 235978820
debug:               scaled[7]: 0
debug:              runtime[0]: 235888002
debug:             walltime[0]: 236700533
debug:       runtime_cycles[0]: 754881504
debug:            counter/0[0]: 235888002
debug:            counter/1[0]: 235888002
debug:            counter/2[0]: 235888002
debug:               scaled[0]: 0
debug:            counter/0[1]: 2
debug:            counter/1[1]: 235887793
debug:            counter/2[1]: 235887793
debug:               scaled[1]: 0
debug:            counter/0[2]: 1
debug:            counter/1[2]: 235887645
debug:            counter/2[2]: 235887645
debug:               scaled[2]: 0
debug:            counter/0[3]: 135
debug:            counter/1[3]: 235887499
debug:            counter/2[3]: 235887499
debug:               scaled[3]: 0
debug:            counter/0[4]: 754881504
debug:            counter/1[4]: 235887368
debug:            counter/2[4]: 235887368
debug:               scaled[4]: 0
debug:            counter/0[5]: 1001401731
debug:            counter/1[5]: 235887024
debug:            counter/2[5]: 235887024
debug:               scaled[5]: 0
debug:            counter/0[6]: 24212
debug:            counter/1[6]: 235886786
debug:            counter/2[6]: 235886786
debug:               scaled[6]: 0
debug:            counter/0[7]: 1824
debug:            counter/1[7]: 235886560
debug:            counter/2[7]: 235886560
debug:               scaled[7]: 0

 Performance counter stats for '/home/mingo/loop_1b_instructions' (3 runs):

     235.913377  task-clock-msecs     #      0.997 CPUs    ( +-   0.011% )
              2  context-switches     #      0.000 M/sec   ( +-   0.000% )
              1  CPU-migrations       #      0.000 M/sec   ( +-   0.000% )
            136  page-faults          #      0.001 M/sec   ( +-   0.730% )
      755048744  cycles               #   3200.534 M/sec   ( +-   0.009% )
     1001417586  instructions         #      1.326 IPC     ( +-   0.001% )
          25277  cache-references     #      0.107 M/sec   ( +-   3.988% )
           2315  cache-misses         #      0.010 M/sec   ( +-   9.845% )

    0.236706075  seconds time elapsed.

This allows the summary stats to be validated.
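
A key to reading the dump: each counter carries the three-word
triplet stored in event_res[run][counter], roughly:

	__u64 count[3];		/* count[0]: raw counter value      */
				/* count[1]: time enabled (scaling) */
				/* count[2]: time running (scaling) */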

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Makefile       |  2 +-
 tools/perf/builtin-stat.c | 46 ++++++++++++++++++++++++---------------
 2 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 0cbd5d6874ec..e8346f95fbb0 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -160,7 +160,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
 CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
-LDFLAGS = -lpthread -lrt -lelf
+LDFLAGS = -lpthread -lrt -lelf -lm
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
 STRIP ?= strip
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 9eb42b1ae784..e5b3c0ff03a9 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -324,9 +324,9 @@ static void print_counter(int counter)
 }
 
 /*
- * Normalize noise values down to stddev:
+ * normalize_noise noise values down to stddev:
  */
-static void normalize(__u64 *val)
+static void normalize_noise(__u64 *val)
 {
 	double res;
 
@@ -335,6 +335,13 @@ static void normalize(__u64 *val)
 	*val = (__u64)res;
 }
 
+static void update_avg(const char *name, int idx, __u64 *avg, __u64 *val)
+{
+	*avg += *val;
+
+	if (verbose > 1)
+		fprintf(stderr, "debug: %20s[%d]: %Ld\n", name, idx, *val);
+}
 /*
  * Calculate the averages and noises:
  */
@@ -342,16 +349,23 @@ static void calc_avg(void)
 {
 	int i, j;
 
+	if (verbose > 1)
+		fprintf(stderr, "\n");
+
 	for (i = 0; i < run_count; i++) {
-		runtime_nsecs_avg += runtime_nsecs[i];
-		walltime_nsecs_avg += walltime_nsecs[i];
-		runtime_cycles_avg += runtime_cycles[i];
+		update_avg("runtime", 0, &runtime_nsecs_avg, runtime_nsecs + i);
+		update_avg("walltime", 0, &walltime_nsecs_avg, walltime_nsecs + i);
+		update_avg("runtime_cycles", 0, &runtime_cycles_avg, runtime_cycles + i);
 
 		for (j = 0; j < nr_counters; j++) {
-			event_res_avg[j][0] += event_res[i][j][0];
-			event_res_avg[j][1] += event_res[i][j][1];
-			event_res_avg[j][2] += event_res[i][j][2];
-			event_scaled_avg[j] += event_scaled[i][j];
+			update_avg("counter/0", j,
+				event_res_avg[j]+0, event_res[i][j]+0);
+			update_avg("counter/1", j,
+				event_res_avg[j]+1, event_res[i][j]+1);
+			update_avg("counter/2", j,
+				event_res_avg[j]+2, event_res[i][j]+2);
+			update_avg("scaled", j,
+				event_scaled_avg + j, event_scaled[i]+j);
 		}
 	}
 	runtime_nsecs_avg /= run_count;
@@ -382,14 +396,14 @@ static void calc_avg(void)
 		}
 	}
 
-	normalize(&runtime_nsecs_noise);
-	normalize(&walltime_nsecs_noise);
-	normalize(&runtime_cycles_noise);
+	normalize_noise(&runtime_nsecs_noise);
+	normalize_noise(&walltime_nsecs_noise);
+	normalize_noise(&runtime_cycles_noise);
 
 	for (j = 0; j < nr_counters; j++) {
-		normalize(&event_res_noise[j][0]);
-		normalize(&event_res_noise[j][1]);
-		normalize(&event_res_noise[j][2]);
+		normalize_noise(&event_res_noise[j][0]);
+		normalize_noise(&event_res_noise[j][1]);
+		normalize_noise(&event_res_noise[j][2]);
 	}
 }
 
@@ -399,8 +413,6 @@ static void print_stat(int argc, const char **argv)
 
 	calc_avg();
 
-	run_idx = 0;
-
 	fflush(stdout);
 
 	fprintf(stderr, "\n");

From c17c2db1f3cea41c3543025905d3582c6937dd95 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 13 Jun 2009 17:39:23 +0200
Subject: [PATCH 10/49] perf annotate: Fixes for filename:line displays

- fix addr2line on userspace binaries: don't resolve only against the kernel image
- fix the string allocation size for the path: there was no room for the terminating null character
- fix an out-of-bounds access of the symbol extra info array

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1244907563-7820-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 7a5b27867a96..94cea678fd7e 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -1116,7 +1116,7 @@ parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 		if (offset < len)
 			hits = sym->hist[offset];
 
-		if (sym_ext) {
+		if (offset < len && sym_ext) {
 			path = sym_ext[offset].path;
 			percent = sym_ext[offset].percent;
 		} else if (sym->hist_sum)
@@ -1190,7 +1190,8 @@ static void free_source_line(struct symbol *sym, int len)
 }
 
 /* Get the filename:line for the colored entries */
-static void get_source_line(struct symbol *sym, __u64 start, int len)
+static void
+get_source_line(struct symbol *sym, __u64 start, int len, char *filename)
 {
 	int i;
 	char cmd[PATH_MAX * 2];
@@ -1216,7 +1217,7 @@ static void get_source_line(struct symbol *sym, __u64 start, int len)
 			continue;
 
 		offset = start + i;
-		sprintf(cmd, "addr2line -e %s %016llx", vmlinux, offset);
+		sprintf(cmd, "addr2line -e %s %016llx", filename, offset);
 		fp = popen(cmd, "r");
 		if (!fp)
 			continue;
@@ -1224,7 +1225,7 @@ static void get_source_line(struct symbol *sym, __u64 start, int len)
 		if (getline(&path, &line_len, fp) < 0 || !line_len)
 			goto next;
 
-		sym_ext[i].path = malloc(sizeof(char) * line_len);
+		sym_ext[i].path = malloc(sizeof(char) * line_len + 1);
 		if (!sym_ext[i].path)
 			goto next;
 
@@ -1285,7 +1286,7 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
 	len = sym->end - sym->start;
 
 	if (print_line) {
-		get_source_line(sym, start, len);
+		get_source_line(sym, start, len, filename);
 		print_summary(filename);
 	}
 

From 8465b05046652cfde3d47692cab2e8ba962f140f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Jun 2009 14:44:07 +0200
Subject: [PATCH 11/49] perf report: Print out raw events in hexa

Print out events in hex dump format when -D is specified:

0x4868 [0x48]: event: 1
.
. ... raw event: size 72 bytes
.  0000:  01 00 00 00 00 00 48 00 d4 72 00 00 d4 72 00 00  ......H..r...r.
.  0010:  00 00 40 f2 3e 00 00 00 00 30 01 00 00 00 00 00  ..@.>....0.....
.  0020:  00 00 00 00 00 00 00 00 2f 75 73 72 2f 6c 69 62  ......../usr/li
.  0030:  36 34 2f 6c 69 62 65 6c 66 2d 30 2e 31 34 31 2e  64/libelf-0.141
.  0040:  73 6f 00 00 00 00 00 00                          f-0.141
.
0x4868 [0x48]: PERF_EVENT_MMAP 29396: [0x3ef2400000(0x13000) @ (nil)]: /usr/lib64/libelf-0.141.so

This helps debug the mis-parsing of data files, and helps with the
addition of new sample/trace formats.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 82fa93b4db99..37515da637f7 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1095,9 +1095,43 @@ process_period_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static void trace_event(event_t *event)
+{
+	unsigned char *raw_event = (void *)event;
+	int i, j;
+
+	if (!dump_trace)
+		return;
+
+	dprintf(".\n. ... raw event: size %d bytes\n", event->header.size);
+
+	for (i = 0; i < event->header.size; i++) {
+		if ((i & 15) == 0)
+			dprintf(".  %04x: ", i);
+
+		dprintf(" %02x", raw_event[i]);
+
+		if (((i & 15) == 15) || i == event->header.size-1) {
+			dprintf("  ");
+			for (j = 0; j < 15-(i & 15); j++)
+				dprintf("   ");
+			for (j = 0; j < (i & 15); j++) {
+				if (isprint(raw_event[i-15+j]))
+					dprintf("%c", raw_event[i-15+j]);
+				else
+					dprintf(".");
+			}
+			dprintf("\n");
+		}
+	}
+	dprintf(".\n");
+}
+
 static int
 process_event(event_t *event, unsigned long offset, unsigned long head)
 {
+	trace_event(event);
+
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
 		return process_overflow_event(event, offset, head);
 
@@ -1204,7 +1238,7 @@ more:
 
 	size = event->header.size;
 
-	dprintf("%p [%p]: event: %d\n",
+	dprintf("\n%p [%p]: event: %d\n",
 			(void *)(offset + head),
 			(void *)(long)event->header.size,
 			event->header.type);

From 3efa1cc99ec51bc7a7ae0011a16619fd20dbe6ea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Jun 2009 15:04:15 +0200
Subject: [PATCH 12/49] perf record/report: Add call graph / call chain
 profiling

Add the first steps of call-graph profiling:

 - add the -g (--call-graph) option to perf record
 - parse the call-graph record and print it out under -D (--dump-trace)

The call-graph data is not put into the histogram yet, but it
can be seen that it's being processed correctly:

0x3ce0 [0x38]: event: 35
.
. ... raw event: size 56 bytes
.  0000:  23 00 00 00 05 00 38 00 d4 df 0e 81 ff ff ff ff  #.....8........
.  0010:  60 0b 00 00 60 0b 00 00 03 00 00 00 01 00 02 00  `...`..........
.  0020:  d4 df 0e 81 ff ff ff ff a0 61 ed 41 36 00 00 00  .........a.A6..
.  0030:  04 92 e6 41 36 00 00 00                          .a.A6..
.
0x3ce0 [0x38]: PERF_EVENT (IP, 5): 2912: 0xffffffff810edfd4 period: 1
... chain: u:2, k:1, nr:3
.....  0: 0xffffffff810edfd4
.....  1: 0x3641ed61a0
.....  2: 0x3641e69204
 ... thread: perf:2912
 ...... dso: [kernel]

This shows a 3-entry call-graph, with one kernel-space and two
user-space entries.
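
For reference, with PERF_SAMPLE_IP | PERF_SAMPLE_TID |
PERF_SAMPLE_PERIOD | PERF_SAMPLE_CALLCHAIN set, the sample payload is
laid out as follows (a sketch of how builtin-report walks the
__more_data area):

	__u64 ip;
	__u32 pid, tid;
	__u64 period;		/* only if PERF_SAMPLE_PERIOD is set    */
	struct {
		__u16 nr, hv, kernel, user;
		__u64 ips[];	/* nr entries */
	} chain;		/* only if PERF_SAMPLE_CALLCHAIN is set */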

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c |  8 ++++++
 tools/perf/builtin-report.c | 57 +++++++++++++++++++++++++++++--------
 2 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0f5771f615da..a177a591b52c 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -37,6 +37,7 @@ static pid_t			target_pid			= -1;
 static int			inherit				= 1;
 static int			force				= 0;
 static int			append_file			= 0;
+static int			call_graph			= 0;
 static int			verbose				= 0;
 
 static long			samples;
@@ -351,11 +352,16 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	int track = 1;
 
 	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+
 	if (freq) {
 		attr->sample_type	|= PERF_SAMPLE_PERIOD;
 		attr->freq		= 1;
 		attr->sample_freq	= freq;
 	}
+
+	if (call_graph)
+		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
+
 	attr->mmap		= track;
 	attr->comm		= track;
 	attr->inherit		= (cpu < 0) && inherit;
@@ -555,6 +561,8 @@ static const struct option options[] = {
 		    "profile at this frequency"),
 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
 		    "number of mmap data pages"),
+	OPT_BOOLEAN('g', "call-graph", &call_graph,
+		    "do call-graph (stack chain/backtrace) recording"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
 	OPT_END()
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 37515da637f7..aebba5659345 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -36,6 +36,7 @@ static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int		dump_trace = 0;
 #define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
+#define cdprintf(x...)	do { if (dump_trace) color_fprintf(stdout, color, x); } while (0)
 
 static int		verbose;
 static int		full_paths;
@@ -43,11 +44,19 @@ static int		full_paths;
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
+struct ip_chain_event {
+	__u16 nr;
+	__u16 hv;
+	__u16 kernel;
+	__u16 user;
+	__u64 ips[];
+};
+
 struct ip_event {
 	struct perf_event_header header;
 	__u64 ip;
 	__u32 pid, tid;
-	__u64 period;
+	unsigned char __more_data[];
 };
 
 struct mmap_event {
@@ -944,9 +953,13 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	__u64 ip = event->ip.ip;
 	__u64 period = 1;
 	struct map *map = NULL;
+	void *more_data = event->ip.__more_data;
+	struct ip_chain_event *chain;
 
-	if (event->header.type & PERF_SAMPLE_PERIOD)
-		period = event->ip.period;
+	if (event->header.type & PERF_SAMPLE_PERIOD) {
+		period = *(__u64 *)more_data;
+		more_data += sizeof(__u64);
+	}
 
 	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
 		(void *)(offset + head),
@@ -956,6 +969,22 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 		(void *)(long)ip,
 		(long long)period);
 
+	if (event->header.type & PERF_SAMPLE_CALLCHAIN) {
+		int i;
+
+		chain = (void *)more_data;
+
+		if (dump_trace) {
+			dprintf("... chain: u:%d, k:%d, nr:%d\n",
+				chain->user,
+				chain->kernel,
+				chain->nr);
+
+			for (i = 0; i < chain->nr; i++)
+				dprintf("..... %2d: %p\n", i, (void *)chain->ips[i]);
+		}
+	}
+
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
 	if (thread == NULL) {
@@ -1098,30 +1127,34 @@ process_period_event(event_t *event, unsigned long offset, unsigned long head)
 static void trace_event(event_t *event)
 {
 	unsigned char *raw_event = (void *)event;
+	char *color = PERF_COLOR_BLUE;
 	int i, j;
 
 	if (!dump_trace)
 		return;
 
-	dprintf(".\n. ... raw event: size %d bytes\n", event->header.size);
+	dprintf(".");
+	cdprintf("\n. ... raw event: size %d bytes\n", event->header.size);
 
 	for (i = 0; i < event->header.size; i++) {
-		if ((i & 15) == 0)
-			dprintf(".  %04x: ", i);
+		if ((i & 15) == 0) {
+			dprintf(".");
+			cdprintf("  %04x: ", i);
+		}
 
-		dprintf(" %02x", raw_event[i]);
+		cdprintf(" %02x", raw_event[i]);
 
 		if (((i & 15) == 15) || i == event->header.size-1) {
-			dprintf("  ");
+			cdprintf("  ");
 			for (j = 0; j < 15-(i & 15); j++)
-				dprintf("   ");
+				cdprintf("   ");
 			for (j = 0; j < (i & 15); j++) {
 				if (isprint(raw_event[i-15+j]))
-					dprintf("%c", raw_event[i-15+j]);
+					cdprintf("%c", raw_event[i-15+j]);
 				else
-					dprintf(".");
+					cdprintf(".");
 			}
-			dprintf("\n");
+			cdprintf("\n");
 		}
 	}
 	dprintf(".\n");

From 5a6cec3abbdb74244caab68db100825a8c4ac02d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 11:25:09 +0200
Subject: [PATCH 13/49] perf_counter, x86: Fix call-chain walking

Fix the pt_regs handling in the user call-chain walker when we hit
user-mode tasks: if the sample did not hit in user mode, take the
task's saved user-mode registers via task_pt_regs() instead of
deriving a pt_regs pointer from thread.sp0.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 77a59a5566a8..09d8cb69c3f3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1644,7 +1644,9 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	const void __user *fp;
 	int nr = entry->nr;
 
-	regs = (struct pt_regs *)current->thread.sp0 - 1;
+	if (!user_mode(regs))
+		regs = task_pt_regs(current);
+
 	fp   = (void __user *)regs->bp;
 
 	callchain_store(entry, regs->ip);
@@ -1656,7 +1658,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		if (!copy_stack_frame(fp, &frame))
 			break;
 
-		if ((unsigned long)fp < user_stack_pointer(regs))
+		if ((unsigned long)fp < regs->sp)
 			break;
 
 		callchain_store(entry, frame.return_address);

From 038e836e97e70c4ad2b5058b07fc7207f50b59dd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 15 Jun 2009 09:57:59 +0200
Subject: [PATCH 14/49] perf_counter, x86: Fix kernel-space call-chains

Kernel-space call-chains were trimmed at the first entry because
we never processed anything beyond the first stack context.

Allow the backtrace to jump from NMI to IRQ stack then to task stack
and finally user-space stack.

Also calculate the stack and bp variables correctly so that the
stack walker does not exit early.

We can get deep traces as a result, visible in perf report -D output:

0x32af0 [0xe0]: PERF_EVENT (IP, 5): 15134: 0xffffffff815225fd period: 1
... chain: u:2, k:22, nr:24
.....  0: 0xffffffff815225fd
.....  1: 0xffffffff810ac51c
.....  2: 0xffffffff81018e29
.....  3: 0xffffffff81523939
.....  4: 0xffffffff81524b8f
.....  5: 0xffffffff81524bd9
.....  6: 0xffffffff8105e498
.....  7: 0xffffffff8152315a
.....  8: 0xffffffff81522c3a
.....  9: 0xffffffff810d9b74
..... 10: 0xffffffff810dbeec
..... 11: 0xffffffff810dc3fb

This is a 22-entry kernel-space chain.

(We still only record reliable stack entries.)
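
The walk across stacks is enabled by the backtrace_stack() callback:
returning 0 tells dump_trace() to keep going when it crosses a stack
boundary instead of aborting at the first one. A minimal sketch of the
callback contract as used here:

	static int backtrace_stack(void *data, char *name)
	{
		/*
		 * 'name' identifies the stack being entered; return
		 * 0 to keep walking (NMI -> IRQ -> task stack), a
		 * negative value to stop at this boundary.
		 */
		return 0;
	}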

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 09d8cb69c3f3..6d5e7cfd97e7 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1575,8 +1575,8 @@ static void backtrace_warning(void *data, char *msg)
 
 static int backtrace_stack(void *data, char *name)
 {
-	/* Don't bother with IRQ stacks for now */
-	return -1;
+	/* Process all stacks: */
+	return 0;
 }
 
 static void backtrace_address(void *data, unsigned long addr, int reliable)
@@ -1594,6 +1594,8 @@ static const struct stacktrace_ops backtrace_ops = {
 	.address		= backtrace_address,
 };
 
+#include "../dumpstack.h"
+
 static void
 perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
@@ -1601,26 +1603,20 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	char *stack;
 	int nr = entry->nr;
 
-	callchain_store(entry, instruction_pointer(regs));
+	callchain_store(entry, regs->ip);
 
 	stack = ((char *)regs + sizeof(struct pt_regs));
 #ifdef CONFIG_FRAME_POINTER
-	bp = frame_pointer(regs);
+	get_bp(bp);
 #else
 	bp = 0;
 #endif
 
-	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+	dump_trace(NULL, regs, (void *)&stack, bp, &backtrace_ops, entry);
 
 	entry->kernel = entry->nr - nr;
 }
 
-
-struct stack_frame {
-	const void __user	*next_fp;
-	unsigned long		return_address;
-};
-
 static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
 {
 	int ret;
@@ -1652,7 +1648,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	callchain_store(entry, regs->ip);
 
 	while (entry->nr < MAX_STACK_DEPTH) {
-		frame.next_fp	     = NULL;
+		frame.next_frame	     = NULL;
 		frame.return_address = 0;
 
 		if (!copy_stack_frame(fp, &frame))
@@ -1662,7 +1658,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 			break;
 
 		callchain_store(entry, frame.return_address);
-		fp = frame.next_fp;
+		fp = frame.next_frame;
 	}
 
 	entry->user = entry->nr - nr;

From 613d8602292165f86ba1969784fea01a06d55900 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 15 Jun 2009 08:17:12 +0200
Subject: [PATCH 15/49] perf record: Fix fast task-exit race

Recording with -a (or with -p) can race with tasks going away:

   couldn't open /proc/8440/maps

This caused an early exit() with no recording done.

Do not abort the recording session - instead just skip that task.

Also, only print the warnings under -v.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a177a591b52c..e1dfef24887f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -202,8 +202,12 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 
 	fd = open(filename, O_RDONLY);
 	if (fd < 0) {
-		fprintf(stderr, "couldn't open %s\n", filename);
-		exit(EXIT_FAILURE);
+		/*
+		 * We raced with a task exiting - just return:
+		 */
+		if (verbose)
+			fprintf(stderr, "couldn't open %s\n", filename);
+		return;
 	}
 	if (read(fd, bf, sizeof(bf)) < 0) {
 		fprintf(stderr, "couldn't read %s\n", filename);
@@ -273,8 +277,12 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 
 	fp = fopen(filename, "r");
 	if (fp == NULL) {
-		fprintf(stderr, "couldn't open %s\n", filename);
-		exit(EXIT_FAILURE);
+		/*
+		 * We raced with a task exiting - just return:
+		 */
+		if (verbose)
+			fprintf(stderr, "couldn't open %s\n", filename);
+		return;
 	}
 	while (1) {
 		char bf[BUFSIZ], *pbf = bf;

From 75f937f24bd9c003dcb9d7d5509f23459f1f6000 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 15 Jun 2009 15:05:12 +0200
Subject: [PATCH 16/49] perf_counter: Fix ctx->mutex vs counter->mutex
 inversion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simon triggered a lockdep inversion report about us taking ctx->mutex
vs counter->mutex in inverse orders. Fix that up.
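
In sketch form, the ABBA ordering that lockdep complained about
(simplified - the real paths run through perf_counter_for_each() and
perf_counter_for_each_sibling()):

	CPU 0					CPU 1
	mutex_lock(&counter->child_mutex);	mutex_lock(&ctx->mutex);
	mutex_lock(&ctx->mutex);		mutex_lock(&counter->child_mutex);

After the fix every path takes ctx->mutex first and only then iterates
the children under child_mutex.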

Reported-by: Simon Holm Thøgersen <odie@cs.aau.dk>
Tested-by: Simon Holm Thøgersen <odie@cs.aau.dk>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e914daff03b5..109a95723859 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1620,22 +1620,6 @@ static void perf_counter_reset(struct perf_counter *counter)
 	perf_counter_update_userpage(counter);
 }
 
-static void perf_counter_for_each_sibling(struct perf_counter *counter,
-					  void (*func)(struct perf_counter *))
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_counter *sibling;
-
-	WARN_ON_ONCE(ctx->parent_ctx);
-	mutex_lock(&ctx->mutex);
-	counter = counter->group_leader;
-
-	func(counter);
-	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
-		func(sibling);
-	mutex_unlock(&ctx->mutex);
-}
-
 /*
  * Holding the top-level counter's child_mutex means that any
  * descendant process that has inherited this counter will block
@@ -1658,14 +1642,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
 static void perf_counter_for_each(struct perf_counter *counter,
 				  void (*func)(struct perf_counter *))
 {
-	struct perf_counter *child;
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *sibling;
 
-	WARN_ON_ONCE(counter->ctx->parent_ctx);
-	mutex_lock(&counter->child_mutex);
-	perf_counter_for_each_sibling(counter, func);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_for_each_sibling(child, func);
-	mutex_unlock(&counter->child_mutex);
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	counter = counter->group_leader;
+
+	perf_counter_for_each_child(counter, func);
+	func(counter);
+	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+		perf_counter_for_each_child(counter, func);
+	mutex_unlock(&ctx->mutex);
 }
 
 static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)

From 465a454f254ee2ff7acc4aececbe31f8af046bc0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 15 Jun 2009 12:31:37 +0200
Subject: [PATCH 17/49] x86, mm: Add __get_user_pages_fast()

Introduce a gup_fast() variant which is usable from IRQ/NMI context.
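
A hypothetical use from NMI context (a sketch; buf/addr/size are
illustrative, and the KM_NMI kmap slot is only introduced by a later
patch in this series):

	struct page *page;
	void *map;

	if (__get_user_pages_fast(addr, 1, 0, &page)) {
		map = kmap_atomic(page, KM_NMI);
		/* stay within the single pinned page: */
		memcpy(buf, map + (addr & ~PAGE_MASK), size);
		kunmap_atomic(map, KM_NMI);
		put_page(page);
	}

On failure it returns the number of pages pinned so far (possibly 0)
instead of falling back to the sleeping slow path, so callers must
tolerate short results.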

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Nick Piggin <npiggin@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/gup.c  | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h |  6 +++++
 2 files changed, 62 insertions(+)

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 6340cef6798a..697d5727c119 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -219,6 +219,62 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 	return 1;
 }
 
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	unsigned long flags;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					(void __user *)start, len)))
+		return 0;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86, with the above PAE exception), we can follow the
+	 * address down to the page and take a ref on it.
+	 */
+	local_irq_save(flags);
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			break;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			break;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_restore(flags);
+
+	return nr;
+}
+
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:	starting user address
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ad613ed66ab0..b457bc047ab1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -862,6 +862,12 @@ extern int mprotect_fixup(struct vm_area_struct *vma,
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
 
+/*
+ * doesn't attempt to fault and will return short.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages);
+
 /*
  * A callback you can register to apply pressure to ageable caches.
  *

From 3ff0141aa3a03ca3388b40b36167d0a37919f3fd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 15 Jun 2009 12:40:41 +0200
Subject: [PATCH 18/49] x86: Add NMI types for kmap_atomic

Add two new kmap_atomic slots for NMI context, and teach
pte_offset_map() about NMI context.
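
The dedicated slots matter because an NMI can arrive while KM_PTE0 is
already in use; mapping through the same slot from NMI context would
clobber the interrupted mapping. In sketch form, the selection that
pte_offset_map() now performs:

	/* pick a dedicated slot when walking pagetables from NMI: */
	pte = kmap_atomic_pte(pmd_page(*dir),
			      in_nmi() ? KM_NMI_PTE : KM_PTE0);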

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Nick Piggin <npiggin@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/kmap_types.h | 4 +++-
 arch/x86/include/asm/pgtable_32.h | 5 +++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
index 5759c165a5cf..ff00a44b7d0d 100644
--- a/arch/x86/include/asm/kmap_types.h
+++ b/arch/x86/include/asm/kmap_types.h
@@ -21,7 +21,9 @@ D(9)	KM_IRQ0,
 D(10)	KM_IRQ1,
 D(11)	KM_SOFTIRQ0,
 D(12)	KM_SOFTIRQ1,
-D(13)	KM_TYPE_NR
+D(13)	KM_NMI,
+D(14)	KM_NMI_PTE,
+D(15)	KM_TYPE_NR
 };
 
 #undef D
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 31bd120cf2a2..85464971bca0 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -49,13 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
 #endif
 
 #if defined(CONFIG_HIGHPTE)
+#define __KM_PTE	(in_nmi() ? KM_NMI_PTE : KM_PTE0)
 #define pte_offset_map(dir, address)					\
-	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) +		\
+	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) +		\
 	 pte_index((address)))
 #define pte_offset_map_nested(dir, address)				\
 	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) +		\
 	 pte_index((address)))
-#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
+#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
 #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
 #else
 #define pte_offset_map(dir, address)					\

From 74193ef0ecab92535c8517f082f1f50504526c9b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 15 Jun 2009 13:07:24 +0200
Subject: [PATCH 19/49] perf_counter: x86: Fix call-chain support to use
 NMI-safe methods

__copy_from_user_inatomic() isn't NMI-safe: it can trigger the page
fault handler, which is another trap, and its return path invokes
IRET, which also closes the NMI context.

Therefore, use a GUP-based approach to copy the stack frames over.

We tried an alternative solution as well: we used a forward ported
version of Mathieu Desnoyers's "NMI safe INT3 and Page Fault" patch
that modifies the exception return path to use an open-coded IRET with
explicit stack unrolling and TF checking.

This didn't work as it interacted with faulting user-space
instructions, causing them not to restart properly and thereby
corrupting user-space registers.

Solving that would probably involve disassembling those instructions
and backtracing the RIP. But even without that, the change was deemed
too complex an addition to the already non-trivial x86 entry assembly
code, so instead we went for this GUP-based method that does a
software walk of the pagetables.
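
The copy proceeds page by page, so a stack frame straddling a page
boundary is simply handled in two iterations. A worked example with
assumed numbers (4096-byte pages, a 16-byte frame starting 8 bytes
before the boundary):

	offset = addr & (PAGE_SIZE - 1);		/* 4088 */
	size   = min(PAGE_SIZE - offset, n - len);	/* min(8, 16) = 8 */

	/* first pass copies 8 bytes; the loop then pins the next
	   page and copies the remaining 8 bytes */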

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 49 ++++++++++++++++++++++++------
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6d5e7cfd97e7..e8c68a5091df 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -19,6 +19,7 @@
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 #include <linux/uaccess.h>
+#include <linux/highmem.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -1617,20 +1618,48 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	entry->kernel = entry->nr - nr;
 }
 
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 {
+	unsigned long offset, addr = (unsigned long)from;
+	int type = in_nmi() ? KM_NMI : KM_IRQ0;
+	unsigned long size, len = 0;
+	struct page *page;
+	void *map;
 	int ret;
 
-	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
-		return 0;
+	do {
+		ret = __get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret)
+			break;
 
-	ret = 1;
-	pagefault_disable();
-	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
-		ret = 0;
-	pagefault_enable();
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
 
-	return ret;
+		map = kmap_atomic(page, type);
+		memcpy(to, map+offset, size);
+		kunmap_atomic(map, type);
+		put_page(page);
+
+		len  += size;
+		to   += size;
+		addr += size;
+
+	} while (len < n);
+
+	return len;
+}
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	unsigned long bytes;
+
+	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
+
+	return bytes == sizeof(*frame);
 }
 
 static void
@@ -1643,7 +1672,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	if (!user_mode(regs))
 		regs = task_pt_regs(current);
 
-	fp   = (void __user *)regs->bp;
+	fp = (void __user *)regs->bp;
 
 	callchain_store(entry, regs->ip);
 

From 3dfabc74c65904c9e6cf952391312d16ea772ef5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 15 Jun 2009 11:24:38 +0200
Subject: [PATCH 20/49] perf report: Add per system call overhead histogram

Take advantage of call-graph per-counter sampling/recording to
display a non-trivial histogram: the true, collapsed/summarized
cost measurement, on a per system call total overhead basis:

 aldebaran:~/linux/linux/tools/perf> ./perf record -g -a -f ~/hackbench 10
 aldebaran:~/linux/linux/tools/perf> ./perf report -s symbol --syscalls | head -10
 #
 # (3536 samples)
 #
 # Overhead  Symbol
 # ........  ......
 #
     40.75%  [k] sys_write
     40.21%  [k] sys_read
      4.44%  [k] do_nmi
 ...

This is done by accounting each (reliable) call-chain that chains back
to a given system call to that system call function.

[ So in the above example we can see that hackbench spends about 40% of
  its total time somewhere in sys_write() and 40% somewhere in
  sys_read(); the rest of the time is spent in user-space. The time
  is not spent in sys_write() _itself_ but in one of its many child
  functions. ]
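
Call-chain entries are stored innermost first, so the last kernel
entry is the syscall entry stub and the entry just before it is the
sys_*() function itself - which is the one that gets accounted
(symbol names are illustrative):

	chain->ips[0]			/* sampled ip, deepest callee */
	...
	chain->ips[chain->kernel-2]	/* sys_write - accounted here */
	chain->ips[chain->kernel-1]	/* syscall entry stub */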

Or, a recording of a kernel build (with the source files already in the page-cache):

 $ perf record -g -m 512 -f -- make -j32 kernel
 $ perf report -s s --syscalls | grep '\[k\]' | grep -v nmi

     4.14%  [k] do_page_fault
     1.20%  [k] sys_write
     1.10%  [k] sys_open
     0.63%  [k] sys_exit_group
     0.48%  [k] smp_apic_timer_interrupt
     0.37%  [k] sys_read
     0.37%  [k] sys_execve
     0.20%  [k] sys_mmap
     0.18%  [k] sys_close
     0.14%  [k] sys_munmap
     0.13%  [k] sys_poll
     0.09%  [k] sys_newstat
     0.07%  [k] sys_clone
     0.06%  [k] sys_newfstat
     0.05%  [k] sys_access
     0.05%  [k] schedule

This shows the true total cost of each syscall variant that gets used
during a kernel build. The profile reveals that pagefaults are the
costliest, followed by read()/write().

An interesting detail: timer interrupts cost 0.5% - or 0.5 seconds
per 100 seconds of kernel build-time. (This was done with HZ=1000.)

The summary is done in 'perf report', i.e. in the post-processing
stage - so once we have a good call-graph recording, this type of
non-trivial high-level analysis becomes possible.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index aebba5659345..1e2f5dde312c 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -40,6 +40,7 @@ static int		dump_trace = 0;
 
 static int		verbose;
 static int		full_paths;
+static int		collapse_syscalls;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -983,6 +984,15 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 			for (i = 0; i < chain->nr; i++)
 				dprintf("..... %2d: %p\n", i, (void *)chain->ips[i]);
 		}
+		if (collapse_syscalls) {
+			/*
+			 * Find the all-but-last kernel entry
+			 * amongst the call-chains - to get
+			 * to the level of system calls:
+			 */
+			if (chain->kernel >= 2)
+				ip = chain->ips[chain->kernel-2];
+		}
 	}
 
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
@@ -1343,6 +1353,8 @@ static const struct option options[] = {
 		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
 	OPT_BOOLEAN('P', "full-paths", &full_paths,
 		    "Don't shorten the pathnames taking into account the cwd"),
+	OPT_BOOLEAN('S', "syscalls", &collapse_syscalls,
+		    "show per syscall summary overhead, using call graph"),
 	OPT_END()
 };
 

From 90c8f954534ba15e4542ab00dd9f0e58b071518c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 15 Jun 2009 21:36:52 +1000
Subject: [PATCH 21/49] perf_counter: powerpc: Fix two compile warnings

This fixes a couple of compile warnings that crept into the powerpc
perf_counter code recently:

   CC      arch/powerpc/kernel/perf_counter.o
 arch/powerpc/kernel/perf_counter.c: In function 'record_and_restart':
 arch/powerpc/kernel/perf_counter.c:1016: warning: unused variable 'addr'
 arch/powerpc/kernel/perf_counter.c: In function 'hw_perf_counter_init':
 arch/powerpc/kernel/perf_counter.c:891: warning: 'ev' may be used uninitialized in this function

Stephen Rothwell reported this against linux-next as well.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18998.12884.787039.22202@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bb202388170e..e6dc1850191c 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -913,6 +913,8 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	case PERF_TYPE_RAW:
 		ev = counter->attr.config;
 		break;
+	default:
+		return ERR_PTR(-EINVAL);
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
@@ -1013,7 +1015,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	u64 period = counter->hw.sample_period;
 	s64 prev, delta, left;
 	int record = 0;
-	u64 addr, mmcra, sdsync;
+	u64 mmcra, sdsync;
 
 	/* we don't have to worry about interrupts here */
 	prev = atomic64_read(&counter->hw.prev_count);

From 9974458e2f9a11dbd2f4bd14fab5a79af4907b41 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 15 Jun 2009 21:45:16 +1000
Subject: [PATCH 22/49] perf_counter: Make set_perf_counter_pending()
 declaration common

At present, every architecture that supports perf_counters has to
declare set_perf_counter_pending() in its arch-specific headers.
This consolidates the declarations into a single declaration in one
common place, include/linux/perf_counter.h.  On powerpc, we continue
to provide a static inline definition of set_perf_counter_pending()
in the powerpc hw_irq.h.

Also, this removes from the x86 perf_counter.h the unused null
definitions of {test,clear}_perf_counter_pending.

Reported-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: benh@kernel.crashing.org
LKML-Reference: <18998.13388.920691.523227@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/hw_irq.h       | 1 -
 arch/powerpc/include/asm/perf_counter.h | 2 ++
 arch/x86/include/asm/perf_counter.h     | 5 -----
 include/linux/perf_counter.h            | 1 +
 4 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 53512374e1c9..1974cf191b03 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -163,7 +163,6 @@ static inline unsigned long test_perf_counter_pending(void)
 	return 0;
 }
 
-static inline void set_perf_counter_pending(void) {}
 static inline void clear_perf_counter_pending(void) {}
 #endif /* CONFIG_PERF_COUNTERS */
 
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index cc7c887705b8..b398a84edced 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -10,6 +10,8 @@
  */
 #include <linux/types.h>
 
+#include <asm/hw_irq.h>
+
 #define MAX_HWCOUNTERS		8
 #define MAX_EVENT_ALTERNATIVES	8
 #define MAX_LIMITED_HWCOUNTERS	2
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 876ed97147b3..5fb33e160ea0 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,11 +84,6 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
-extern void set_perf_counter_pending(void);
-
-#define clear_perf_counter_pending()	do { } while (0)
-#define test_perf_counter_pending()	(0)
-
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(void);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1b3118a1023a..eccae437fe37 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -604,6 +604,7 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern int perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_free_task(struct task_struct *task);
+extern void set_perf_counter_pending(void);
 extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
 extern void __perf_disable(void);

From e2eae0f5605b90a0838608043c21050b08b6dd95 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 15 Jun 2009 16:15:19 +0200
Subject: [PATCH 23/49] perf report: Fix 32-bit printf format

Yong Wang reported the following compiler warning:

 builtin-report.c: In function 'process_overflow_event':
 builtin-report.c:984: error: cast to pointer from integer of different size

Which happens because we try to print ->ips[] out with a limited
format, losing the high 32 bits. Print it out using %016Lx instead.
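
A minimal sketch of the truncation, with a hypothetical value on a
32-bit build:

	__u64 ip = 0xffffffff815225fdULL;

	printf("%p\n", (void *)ip);	/* truncated: 0x815225fd */
	printf("%016Lx\n", ip);		/* ffffffff815225fd everywhere */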

Reported-by: Yong Wang <yong.y.wang@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 1e2f5dde312c..f86bb07c0e84 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -982,7 +982,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 				chain->nr);
 
 			for (i = 0; i < chain->nr; i++)
-				dprintf("..... %2d: %p\n", i, (void *)chain->ips[i]);
+				dprintf("..... %2d: %016Lx\n", i, chain->ips[i]);
 		}
 		if (collapse_syscalls) {
 			/*

From 0990b1c65729012a63e0eeca93aaaafea4e9a064 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 15 Jun 2009 16:46:05 +0200
Subject: [PATCH 24/49] x86: Add NMI types for kmap_atomic, fix

I just realized this has a kmap_atomic bug in it...

The patch below fixes it - but it complicates this code some more.

Alternatively I would have to introduce something like
pte_offset_map_irq(), which would do the irq/nmi detection and leave
the regular code paths alone. However, that would mean either
duplicating the gup_fast() pagewalk or passing down a pte function
pointer, which would only duplicate the gup_pte_range() bit; neither
is really attractive ...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Nick Piggin <npiggin@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/kmap_types.h | 11 ++++++-----
 arch/x86/include/asm/pgtable_32.h |  5 ++++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
index ff00a44b7d0d..f86613846198 100644
--- a/arch/x86/include/asm/kmap_types.h
+++ b/arch/x86/include/asm/kmap_types.h
@@ -19,11 +19,12 @@ D(7)	KM_PTE0,
 D(8)	KM_PTE1,
 D(9)	KM_IRQ0,
 D(10)	KM_IRQ1,
-D(11)	KM_SOFTIRQ0,
-D(12)	KM_SOFTIRQ1,
-D(13)	KM_NMI,
-D(14)	KM_NMI_PTE,
-D(15)	KM_TYPE_NR
+D(11)	KM_IRQ_PTE,
+D(12)	KM_SOFTIRQ0,
+D(13)	KM_SOFTIRQ1,
+D(14)	KM_NMI,
+D(15)	KM_NMI_PTE,
+D(16)	KM_TYPE_NR
 };
 
 #undef D
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 85464971bca0..01fd9461d323 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -49,7 +49,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
 #endif
 
 #if defined(CONFIG_HIGHPTE)
-#define __KM_PTE	(in_nmi() ? KM_NMI_PTE : KM_PTE0)
+#define __KM_PTE			\
+	(in_nmi() ? KM_NMI_PTE : 	\
+	 in_irq() ? KM_IRQ_PTE :	\
+	 KM_PTE0)
 #define pte_offset_map(dir, address)					\
 	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) +		\
 	 pte_index((address)))

From 6e7d6fdcbeefa9434653b5e5da12909636ea1d52 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 17 Jun 2009 15:51:44 +0200
Subject: [PATCH 25/49] perf report: Add --sort <call> --call <$regex>

Implement sorting by callchain symbols, --sort <call>.

It creates a new column which shows the first call-chain symbol that
matches --call $regex, or "[unmatched]" when nothing matches.
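
Example usage (the pattern shown is also the built-in default):

 $ perf report --sort call --call '^sys_'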

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 209 +++++++++++++++++++++++++++---------
 1 file changed, 158 insertions(+), 51 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f86bb07c0e84..cd74b2e58adb 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -40,11 +40,13 @@ static int		dump_trace = 0;
 
 static int		verbose;
 static int		full_paths;
-static int		collapse_syscalls;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
+static char		*call = "^sys_";
+static regex_t		call_regex;
+
 struct ip_chain_event {
 	__u16 nr;
 	__u16 hv;
@@ -463,6 +465,7 @@ struct hist_entry {
 	struct map	 *map;
 	struct dso	 *dso;
 	struct symbol	 *sym;
+	struct symbol	 *call;
 	__u64		 ip;
 	char		 level;
 
@@ -483,6 +486,16 @@ struct sort_entry {
 	size_t	(*print)(FILE *fp, struct hist_entry *);
 };
 
+static int64_t cmp_null(void *l, void *r)
+{
+	if (!l && !r)
+		return 0;
+	else if (!l)
+		return -1;
+	else
+		return 1;
+}
+
 /* --sort pid */
 
 static int64_t
@@ -517,14 +530,8 @@ sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
 	char *comm_l = left->thread->comm;
 	char *comm_r = right->thread->comm;
 
-	if (!comm_l || !comm_r) {
-		if (!comm_l && !comm_r)
-			return 0;
-		else if (!comm_l)
-			return -1;
-		else
-			return 1;
-	}
+	if (!comm_l || !comm_r)
+		return cmp_null(comm_l, comm_r);
 
 	return strcmp(comm_l, comm_r);
 }
@@ -550,14 +557,8 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 	struct dso *dso_l = left->dso;
 	struct dso *dso_r = right->dso;
 
-	if (!dso_l || !dso_r) {
-		if (!dso_l && !dso_r)
-			return 0;
-		else if (!dso_l)
-			return -1;
-		else
-			return 1;
-	}
+	if (!dso_l || !dso_r)
+		return cmp_null(dso_l, dso_r);
 
 	return strcmp(dso_l->name, dso_r->name);
 }
@@ -617,7 +618,38 @@ static struct sort_entry sort_sym = {
 	.print	= sort__sym_print,
 };
 
+/* --sort call */
+
+static int64_t
+sort__call_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct symbol *sym_l = left->call;
+	struct symbol *sym_r = right->call;
+
+	if (!sym_l || !sym_r)
+		return cmp_null(sym_l, sym_r);
+
+	return strcmp(sym_l->name, sym_r->name);
+}
+
+static size_t
+sort__call_print(FILE *fp, struct hist_entry *self)
+{
+	size_t ret = 0;
+
+	ret += fprintf(fp, "%-20s", self->call ? self->call->name : "[unmatched]");
+
+	return ret;
+}
+
+static struct sort_entry sort_call = {
+	.header = "Callchain symbol    ",
+	.cmp	= sort__call_cmp,
+	.print	= sort__call_print,
+};
+
 static int sort__need_collapse = 0;
+static int sort__has_call = 0;
 
 struct sort_dimension {
 	char			*name;
@@ -630,6 +662,7 @@ static struct sort_dimension sort_dimensions[] = {
 	{ .name = "comm",	.entry = &sort_comm,	},
 	{ .name = "dso",	.entry = &sort_dso,	},
 	{ .name = "symbol",	.entry = &sort_sym,	},
+	{ .name = "call",	.entry = &sort_call,	},
 };
 
 static LIST_HEAD(hist_entry__sort_list);
@@ -650,6 +683,18 @@ static int sort_dimension__add(char *tok)
 		if (sd->entry->collapse)
 			sort__need_collapse = 1;
 
+		if (sd->entry == &sort_call) {
+			int ret = regcomp(&call_regex, call, REG_EXTENDED);
+			if (ret) {
+				char err[BUFSIZ];
+
+				regerror(ret, &call_regex, err, sizeof(err));
+				fprintf(stderr, "Invalid regex: %s\n%s", call, err);
+				exit(-1);
+			}
+			sort__has_call = 1;
+		}
+
 		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
 		sd->taken = 1;
 
@@ -730,13 +775,76 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples)
 	return ret;
 }
 
+/*
+ * symbol resolution:
+ */
+
+static struct symbol *
+resolve_symbol(struct thread *thread, struct map **mapp,
+	       struct dso **dsop, __u64 *ipp)
+{
+	struct dso *dso = dsop ? *dsop : NULL;
+	struct map *map = mapp ? *mapp : NULL;
+	uint64_t ip = *ipp;
+
+	if (!thread)
+		return NULL;
+
+	if (dso)
+		goto got_dso;
+
+	if (map)
+		goto got_map;
+
+	map = thread__find_map(thread, ip);
+	if (map != NULL) {
+		if (mapp)
+			*mapp = map;
+got_map:
+		ip = map->map_ip(map, ip);
+		*ipp  = ip;
+
+		dso = map->dso;
+	} else {
+		/*
+		 * If this is outside of all known maps,
+		 * and is a negative address, try to look it
+		 * up in the kernel dso, as it might be a
+		 * vsyscall (which executes in user-mode):
+		 */
+		if ((long long)ip < 0)
+			dso = kernel_dso;
+	}
+	dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+
+	if (dsop)
+		*dsop = dso;
+
+	if (!dso)
+		return NULL;
+got_dso:
+	return dso->find_symbol(dso, ip);
+}
+
+static struct symbol *call__match(struct symbol *sym)
+{
+	if (!sym)
+		return NULL;
+
+	if (sym->name && !regexec(&call_regex, sym->name, 0, NULL, 0))
+		return sym;
+
+	return NULL;
+}
+
 /*
  * collect histogram counts
  */
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, __u64 ip, char level, __u64 count)
+		struct symbol *sym, __u64 ip, struct ip_chain_event *chain,
+	       	char level, __u64 count)
 {
 	struct rb_node **p = &hist.rb_node;
 	struct rb_node *parent = NULL;
@@ -752,6 +860,33 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 	};
 	int cmp;
 
+	if (sort__has_call && chain) {
+		int i, nr = chain->hv;
+		struct symbol *sym;
+		struct dso *dso;
+		__u64 ip;
+
+		for (i = 0; i < chain->kernel; i++) {
+			ip = chain->ips[nr + i];
+			dso = kernel_dso;
+			sym = resolve_symbol(thread, NULL, &dso, &ip);
+			entry.call = call__match(sym);
+			if (entry.call)
+				goto got_call;
+		}
+		nr += i;
+
+		for (i = 0; i < chain->user; i++) {
+			ip = chain->ips[nr + i];
+			sym = resolve_symbol(thread, NULL, NULL, &ip);
+			entry.call = call__match(sym);
+			if (entry.call)
+				goto got_call;
+		}
+		nr += i;
+	}
+got_call:
+
 	while (*p != NULL) {
 		parent = *p;
 		he = rb_entry(parent, struct hist_entry, rb_node);
@@ -955,7 +1090,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	__u64 period = 1;
 	struct map *map = NULL;
 	void *more_data = event->ip.__more_data;
-	struct ip_chain_event *chain;
+	struct ip_chain_event *chain = NULL;
 
 	if (event->header.type & PERF_SAMPLE_PERIOD) {
 		period = *(__u64 *)more_data;
@@ -984,15 +1119,6 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 			for (i = 0; i < chain->nr; i++)
 				dprintf("..... %2d: %016Lx\n", i, chain->ips[i]);
 		}
-		if (collapse_syscalls) {
-			/*
-			 * Find the all-but-last kernel entry
-			 * amongst the call-chains - to get
-			 * to the level of system calls:
-			 */
-			if (chain->kernel >= 2)
-				ip = chain->ips[chain->kernel-2];
-		}
 	}
 
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
@@ -1016,22 +1142,6 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 		show = SHOW_USER;
 		level = '.';
 
-		map = thread__find_map(thread, ip);
-		if (map != NULL) {
-			ip = map->map_ip(map, ip);
-			dso = map->dso;
-		} else {
-			/*
-			 * If this is outside of all known maps,
-			 * and is a negative address, try to look it
-			 * up in the kernel dso, as it might be a
-			 * vsyscall (which executes in user-mode):
-			 */
-			if ((long long)ip < 0)
-				dso = kernel_dso;
-		}
-		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
-
 	} else {
 		show = SHOW_HV;
 		level = 'H';
@@ -1039,12 +1149,9 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	}
 
 	if (show & show_mask) {
-		struct symbol *sym = NULL;
+		struct symbol *sym = resolve_symbol(thread, &map, &dso, &ip);
 
-		if (dso)
-			sym = dso->find_symbol(dso, ip);
-
-		if (hist_entry__add(thread, map, dso, sym, ip, level, period)) {
+		if (hist_entry__add(thread, map, dso, sym, ip, chain, level, period)) {
 			fprintf(stderr,
 		"problem incrementing symbol count, skipping event\n");
 			return -1;
@@ -1353,8 +1460,8 @@ static const struct option options[] = {
 		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
 	OPT_BOOLEAN('P', "full-paths", &full_paths,
 		    "Don't shorten the pathnames taking into account the cwd"),
-	OPT_BOOLEAN('S', "syscalls", &collapse_syscalls,
-		    "show per syscall summary overhead, using call graph"),
+	OPT_STRING('c', "call", &call, "regex",
+		   "regex to use for --sort call"),
 	OPT_END()
 };
 

From 60f916dee612130c9977a8edd4abee98334202ba Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 15 Jun 2009 19:00:20 +0200
Subject: [PATCH 26/49] perf_counter: x86: Set the period in the intel overflow
 handler

Commit 9e350de37ac960 ("perf_counter: Accurate period data")
missed a spot, which caused all Intel-PMU samples to have a
period of 0.

This broke auto-freq sampling.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index e8c68a5091df..ce1ae3f1f86c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1224,6 +1224,8 @@ again:
 		if (!intel_pmu_save_and_restart(counter))
 			continue;
 
+		data.period = counter->hw.last_period;
+
 		if (perf_counter_overflow(counter, 1, &data))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}

From 5aa75a0fd4bc6402899e06fdb853cab024d65055 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 15 Jun 2009 20:11:41 +0200
Subject: [PATCH 27/49] perf_counter tools: Replace isprint() with issane()

The Git utils came with a ctype replacement that doesn't provide
isprint(). Add a replacement.

Solves a build bug on certain distros.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 2 +-
 tools/perf/util/util.h      | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index cd74b2e58adb..707f60ce32fd 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1266,7 +1266,7 @@ static void trace_event(event_t *event)
 			for (j = 0; j < 15-(i & 15); j++)
 				cdprintf("   ");
 			for (j = 0; j < (i & 15); j++) {
-				if (isprint(raw_event[i-15+j]))
+				if (issane(raw_event[i-15+j]))
 					cdprintf("%c", raw_event[i-15+j]);
 				else
 					cdprintf(".");
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 76590a16c271..ce9b514f60a3 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -343,6 +343,7 @@ extern unsigned char sane_ctype[256];
 #define isdigit(x) sane_istest(x,GIT_DIGIT)
 #define isalpha(x) sane_istest(x,GIT_ALPHA)
 #define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
+#define issane(x)  sane_istest(x,GIT_SPACE | GIT_DIGIT | GIT_ALPHA | GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
 #define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
 #define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
 #define tolower(x) sane_case((unsigned char)(x), 0x20)

From b25bcf2f133b1e6216c3d40be394756107d3880f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 18 Jun 2009 07:01:03 +0200
Subject: [PATCH 28/49] perf report: Tidy up the "--parent <regex>" and "--sort
 parent" call-chain features

Instead of the ambiguous 'call' naming, use the much more
specific 'parent' naming:

 - rename --call <regex> to --parent <regex>

 - rename --sort call to --sort parent

 - rename [unmatched] to [other] - to signal that this is not
   an error but the inverse set

Also add pagefaults to the default parent-symbol pattern too,
as it's a 'syscall overhead category' in a sense.
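
Example usage with the new naming (the pattern shown is the new
default):

 $ perf report --sort parent --parent '^sys_|^do_page_fault'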

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 67 +++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 707f60ce32fd..986834623b43 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -44,8 +44,8 @@ static int		full_paths;
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
-static char		*call = "^sys_";
-static regex_t		call_regex;
+static char		*parent_pattern = "^sys_|^do_page_fault";
+static regex_t		parent_regex;
 
 struct ip_chain_event {
 	__u16 nr;
@@ -465,7 +465,7 @@ struct hist_entry {
 	struct map	 *map;
 	struct dso	 *dso;
 	struct symbol	 *sym;
-	struct symbol	 *call;
+	struct symbol	 *parent;
 	__u64		 ip;
 	char		 level;
 
@@ -618,13 +618,13 @@ static struct sort_entry sort_sym = {
 	.print	= sort__sym_print,
 };
 
-/* --sort call */
+/* --sort parent */
 
 static int64_t
-sort__call_cmp(struct hist_entry *left, struct hist_entry *right)
+sort__parent_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	struct symbol *sym_l = left->call;
-	struct symbol *sym_r = right->call;
+	struct symbol *sym_l = left->parent;
+	struct symbol *sym_r = right->parent;
 
 	if (!sym_l || !sym_r)
 		return cmp_null(sym_l, sym_r);
@@ -633,23 +633,23 @@ sort__call_cmp(struct hist_entry *left, struct hist_entry *right)
 }
 
 static size_t
-sort__call_print(FILE *fp, struct hist_entry *self)
+sort__parent_print(FILE *fp, struct hist_entry *self)
 {
 	size_t ret = 0;
 
-	ret += fprintf(fp, "%-20s", self->call ? self->call->name : "[unmatched]");
+	ret += fprintf(fp, "%-20s", self->parent ? self->parent->name : "[other]");
 
 	return ret;
 }
 
-static struct sort_entry sort_call = {
-	.header = "Callchain symbol    ",
-	.cmp	= sort__call_cmp,
-	.print	= sort__call_print,
+static struct sort_entry sort_parent = {
+	.header = "Parent symbol       ",
+	.cmp	= sort__parent_cmp,
+	.print	= sort__parent_print,
 };
 
 static int sort__need_collapse = 0;
-static int sort__has_call = 0;
+static int sort__has_parent = 0;
 
 struct sort_dimension {
 	char			*name;
@@ -662,7 +662,7 @@ static struct sort_dimension sort_dimensions[] = {
 	{ .name = "comm",	.entry = &sort_comm,	},
 	{ .name = "dso",	.entry = &sort_dso,	},
 	{ .name = "symbol",	.entry = &sort_sym,	},
-	{ .name = "call",	.entry = &sort_call,	},
+	{ .name = "parent",	.entry = &sort_parent,	},
 };
 
 static LIST_HEAD(hist_entry__sort_list);
@@ -683,16 +683,17 @@ static int sort_dimension__add(char *tok)
 		if (sd->entry->collapse)
 			sort__need_collapse = 1;
 
-		if (sd->entry == &sort_call) {
-			int ret = regcomp(&call_regex, call, REG_EXTENDED);
+		if (sd->entry == &sort_parent) {
+			int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
 			if (ret) {
 				char err[BUFSIZ];
 
-				regerror(ret, &call_regex, err, sizeof(err));
-				fprintf(stderr, "Invalid regex: %s\n%s", call, err);
+				regerror(ret, &parent_regex, err, sizeof(err));
+				fprintf(stderr, "Invalid regex: %s\n%s",
+					parent_pattern, err);
 				exit(-1);
 			}
-			sort__has_call = 1;
+			sort__has_parent = 1;
 		}
 
 		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
@@ -831,7 +832,7 @@ static struct symbol *call__match(struct symbol *sym)
 	if (!sym)
 		return NULL;
 
-	if (sym->name && !regexec(&call_regex, sym->name, 0, NULL, 0))
+	if (sym->name && !regexec(&parent_regex, sym->name, 0, NULL, 0))
 		return sym;
 
 	return NULL;
@@ -844,7 +845,7 @@ static struct symbol *call__match(struct symbol *sym)
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		struct symbol *sym, __u64 ip, struct ip_chain_event *chain,
-	       	char level, __u64 count)
+		char level, __u64 count)
 {
 	struct rb_node **p = &hist.rb_node;
 	struct rb_node *parent = NULL;
@@ -860,7 +861,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 	};
 	int cmp;
 
-	if (sort__has_call && chain) {
+	if (sort__has_parent && chain) {
 		int i, nr = chain->hv;
 		struct symbol *sym;
 		struct dso *dso;
@@ -870,22 +871,22 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 			ip = chain->ips[nr + i];
 			dso = kernel_dso;
 			sym = resolve_symbol(thread, NULL, &dso, &ip);
-			entry.call = call__match(sym);
-			if (entry.call)
-				goto got_call;
+			entry.parent = call__match(sym);
+			if (entry.parent)
+				goto got_parent;
 		}
 		nr += i;
 
 		for (i = 0; i < chain->user; i++) {
 			ip = chain->ips[nr + i];
 			sym = resolve_symbol(thread, NULL, NULL, &ip);
-			entry.call = call__match(sym);
-			if (entry.call)
-				goto got_call;
+			entry.parent = call__match(sym);
+			if (entry.parent)
+				goto got_parent;
 		}
 		nr += i;
 	}
-got_call:
+got_parent:
 
 	while (*p != NULL) {
 		parent = *p;
@@ -1457,11 +1458,11 @@ static const struct option options[] = {
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
-		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
+		   "sort by key(s): pid, comm, dso, symbol, parent"),
 	OPT_BOOLEAN('P', "full-paths", &full_paths,
 		    "Don't shorten the pathnames taking into account the cwd"),
-	OPT_STRING('c', "call", &call, "regex",
-		   "regex to use for --sort call"),
+	OPT_STRING('p', "parent", &parent_pattern, "regex",
+		   "regex filter to identify parent, see: '--sort parent'"),
 	OPT_END()
 };
 

From 7522060c95395f479ee4a6af3bbf9e097e92e48f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 18 Jun 2009 08:00:17 +0200
Subject: [PATCH 29/49] perf report: Add validation of call-chain entries

Add boundary checks for call-chain events; without them, corrupted
entries could crash the tool.
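
In sketch form, a chain is only accepted when its per-context counts
are self-consistent and its entries actually fit inside the event
record:

	chain->nr, hv, kernel, user	each <= MAX_STACK_DEPTH
	chain->hv + kernel + user	== chain->nr
	chain->nr * sizeof(__u64)	<= bytes left in the record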

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 20 +++++-----
 tools/perf/builtin-report.c  | 74 ++++++++++++++++++++++--------------
 2 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index eccae437fe37..a7d3a61a59b7 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -337,6 +337,16 @@ enum perf_event_type {
 	 */
 };
 
+#define MAX_STACK_DEPTH			255
+
+struct perf_callchain_entry {
+	__u16				nr;
+	__u16				hv;
+	__u16				kernel;
+	__u16				user;
+	__u64				ip[MAX_STACK_DEPTH];
+};
+
 #ifdef __KERNEL__
 /*
  * Kernel-internal data types and definitions:
@@ -652,16 +662,6 @@ extern void perf_counter_fork(struct task_struct *tsk);
 
 extern void perf_counter_task_migration(struct task_struct *task, int cpu);
 
-#define MAX_STACK_DEPTH			255
-
-struct perf_callchain_entry {
-	u16				nr;
-	u16				hv;
-	u16				kernel;
-	u16				user;
-	u64				ip[MAX_STACK_DEPTH];
-};
-
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_paranoid;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 986834623b43..e14e98676171 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -39,6 +39,8 @@ static int		dump_trace = 0;
 #define cdprintf(x...)	do { if (dump_trace) color_fprintf(stdout, color, x); } while (0)
 
 static int		verbose;
+#define eprintf(x...)	do { if (verbose) fprintf(stderr, x); } while (0)
+
 static int		full_paths;
 
 static unsigned long	page_size;
@@ -47,14 +49,6 @@ static unsigned long	mmap_window = 32;
 static char		*parent_pattern = "^sys_|^do_page_fault";
 static regex_t		parent_regex;
 
-struct ip_chain_event {
-	__u16 nr;
-	__u16 hv;
-	__u16 kernel;
-	__u16 user;
-	__u64 ips[];
-};
-
 struct ip_event {
 	struct perf_event_header header;
 	__u64 ip;
@@ -131,15 +125,11 @@ static struct dso *dsos__findnew(const char *name)
 
 	nr = dso__load(dso, NULL, verbose);
 	if (nr < 0) {
-		if (verbose)
-			fprintf(stderr, "Failed to open: %s\n", name);
+		eprintf("Failed to open: %s\n", name);
 		goto out_delete_dso;
 	}
-	if (!nr && verbose) {
-		fprintf(stderr,
-		"No symbols found in: %s, maybe install a debug package?\n",
-				name);
-	}
+	if (!nr)
+		eprintf("No symbols found in: %s, maybe install a debug package?\n", name);
 
 	dsos__add(dso);
 
@@ -844,7 +834,7 @@ static struct symbol *call__match(struct symbol *sym)
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, __u64 ip, struct ip_chain_event *chain,
+		struct symbol *sym, __u64 ip, struct perf_callchain_entry *chain,
 		char level, __u64 count)
 {
 	struct rb_node **p = &hist.rb_node;
@@ -868,7 +858,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		__u64 ip;
 
 		for (i = 0; i < chain->kernel; i++) {
-			ip = chain->ips[nr + i];
+			ip = chain->ip[nr + i];
 			dso = kernel_dso;
 			sym = resolve_symbol(thread, NULL, &dso, &ip);
 			entry.parent = call__match(sym);
@@ -878,7 +868,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		nr += i;
 
 		for (i = 0; i < chain->user; i++) {
-			ip = chain->ips[nr + i];
+			ip = chain->ip[nr + i];
 			sym = resolve_symbol(thread, NULL, NULL, &ip);
 			entry.parent = call__match(sym);
 			if (entry.parent)
@@ -1080,6 +1070,30 @@ static unsigned long total = 0,
 		     total_fork = 0,
 		     total_unknown = 0;
 
+static int validate_chain(struct perf_callchain_entry *chain, event_t *event)
+{
+	unsigned int chain_size;
+
+	if (chain->nr > MAX_STACK_DEPTH)
+		return -1;
+	if (chain->hv > MAX_STACK_DEPTH)
+		return -1;
+	if (chain->kernel > MAX_STACK_DEPTH)
+		return -1;
+	if (chain->user > MAX_STACK_DEPTH)
+		return -1;
+	if (chain->hv + chain->kernel + chain->user != chain->nr)
+		return -1;
+
+	chain_size = event->header.size;
+	chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event;
+
+	if (chain->nr*sizeof(__u64) > chain_size)
+		return -1;
+
+	return 0;
+}
+
 static int
 process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -1091,7 +1105,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	__u64 period = 1;
 	struct map *map = NULL;
 	void *more_data = event->ip.__more_data;
-	struct ip_chain_event *chain = NULL;
+	struct perf_callchain_entry *chain = NULL;
 
 	if (event->header.type & PERF_SAMPLE_PERIOD) {
 		period = *(__u64 *)more_data;
@@ -1111,21 +1125,26 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 
 		chain = (void *)more_data;
 
-		if (dump_trace) {
-			dprintf("... chain: u:%d, k:%d, nr:%d\n",
-				chain->user,
-				chain->kernel,
-				chain->nr);
+		dprintf("... chain: u:%d, k:%d, nr:%d\n",
+			chain->user,
+			chain->kernel,
+			chain->nr);
 
+		if (validate_chain(chain, event) < 0) {
+			eprintf("call-chain problem with event, skipping it.\n");
+			return 0;
+		}
+
+		if (dump_trace) {
 			for (i = 0; i < chain->nr; i++)
-				dprintf("..... %2d: %016Lx\n", i, chain->ips[i]);
+				dprintf("..... %2d: %016Lx\n", i, chain->ip[i]);
 		}
 	}
 
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
 	if (thread == NULL) {
-		fprintf(stderr, "problem processing %d event, skipping it.\n",
+		eprintf("problem processing %d event, skipping it.\n",
 			event->header.type);
 		return -1;
 	}
@@ -1153,8 +1172,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 		struct symbol *sym = resolve_symbol(thread, &map, &dso, &ip);
 
 		if (hist_entry__add(thread, map, dso, sym, ip, chain, level, period)) {
-			fprintf(stderr,
-		"problem incrementing symbol count, skipping event\n");
+			eprintf("problem incrementing symbol count, skipping event\n");
 			return -1;
 		}
 	}

From a73c7d84a1975b44c0ebd03c2dec288af1426349 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 18 Jun 2009 09:44:20 +0200
Subject: [PATCH 30/49] perf_counter tools: Add and use isprint()

Introduce isprint(), used to print out raw event dumps as ASCII, etc.

(This is an extension to upstream Git's ctype.c.)
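
With the extended table, isprint() presumably tests the new GIT_PRINT
mask, covering 0x20..0x7e. In sketch form:

	isprint('A')	/* 1: GIT_ALPHA */
	isprint(' ')	/* 1: space carries GIT_PRINT_EXTRA */
	isprint('\t')	/* 0: GIT_SPACE only */
	isprint(0x7f)	/* 0: DEL, no bits set */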

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
[ removed openssl.h inclusion from util.h - it leaked ctype.h ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c |  2 +-
 tools/perf/util/ctype.c     | 17 +++++++++++------
 tools/perf/util/util.h      | 19 ++++++++-----------
 3 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index e14e98676171..9a3805f0c9f2 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1285,7 +1285,7 @@ static void trace_event(event_t *event)
 			for (j = 0; j < 15-(i & 15); j++)
 				cdprintf("   ");
 			for (j = 0; j < (i & 15); j++) {
-				if (issane(raw_event[i-15+j]))
+				if (isprint(raw_event[i-15+j]))
 					cdprintf("%c", raw_event[i-15+j]);
 				else
 					cdprintf(".");
diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c
index b90ec004f29c..0b791bd346bc 100644
--- a/tools/perf/util/ctype.c
+++ b/tools/perf/util/ctype.c
@@ -11,16 +11,21 @@ enum {
 	D = GIT_DIGIT,
 	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
 	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | * */
+	P = GIT_PRINT_EXTRA,	/* printable - alpha - digit - glob - regex */
+
+	PS = GIT_SPACE | GIT_PRINT_EXTRA,
 };
 
 unsigned char sane_ctype[256] = {
+/*	0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F			    */
+
 	0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0,		/*   0.. 15 */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/*  16.. 31 */
-	S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0,		/*  32.. 47 */
-	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G,		/*  48.. 63 */
-	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
-	A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0,		/*  80.. 95 */
-	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
-	A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0,		/* 112..127 */
+	PS,P, P, P, R, P, P, P, R, R, G, R, P, P, R, P,		/*  32.. 47 */
+	D, D, D, D, D, D, D, D, D, D, P, P, P, P, P, G,		/*  48.. 63 */
+	P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
+	A, A, A, A, A, A, A, A, A, A, A, G, G, P, R, P,		/*  80.. 95 */
+	P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
+	A, A, A, A, A, A, A, A, A, A, A, R, R, P, P, 0,		/* 112..127 */
 	/* Nothing in the 128.. range */
 };
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index ce9b514f60a3..b8cfed776d81 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -100,11 +100,6 @@
 #include <iconv.h>
 #endif
 
-#ifndef NO_OPENSSL
-#include <openssl/ssl.h>
-#include <openssl/err.h>
-#endif
-
 /* On most systems <limits.h> would have given us this, but
  * not on some systems (e.g. GNU/Hurd).
  */
@@ -332,18 +327,20 @@ static inline int has_extension(const char *filename, const char *ext)
 #undef tolower
 #undef toupper
 extern unsigned char sane_ctype[256];
-#define GIT_SPACE 0x01
-#define GIT_DIGIT 0x02
-#define GIT_ALPHA 0x04
-#define GIT_GLOB_SPECIAL 0x08
-#define GIT_REGEX_SPECIAL 0x10
+#define GIT_SPACE		0x01
+#define GIT_DIGIT		0x02
+#define GIT_ALPHA		0x04
+#define GIT_GLOB_SPECIAL	0x08
+#define GIT_REGEX_SPECIAL	0x10
+#define GIT_PRINT_EXTRA		0x20
+#define GIT_PRINT		0x3E
 #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
 #define isascii(x) (((x) & ~0x7f) == 0)
 #define isspace(x) sane_istest(x,GIT_SPACE)
 #define isdigit(x) sane_istest(x,GIT_DIGIT)
 #define isalpha(x) sane_istest(x,GIT_ALPHA)
 #define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
-#define issane(x)  sane_istest(x,GIT_SPACE | GIT_DIGIT | GIT_ALPHA | GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
+#define isprint(x) sane_istest(x,GIT_PRINT)
 #define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
 #define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
 #define tolower(x) sane_case((unsigned char)(x), 0x20)

From 105988c015943e77092a6568bc5fb7e386df6ccd Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 17 Jun 2009 21:50:04 +1000
Subject: [PATCH 31/49] perf_counter: powerpc: Enable use of software counters
 on 32-bit powerpc

This enables the perf_counter subsystem on 32-bit powerpc.  Since we
don't have any support for hardware counters on 32-bit powerpc yet,
only software counters can be used.

Besides selecting HAVE_PERF_COUNTERS for 32-bit powerpc as well as
64-bit, the main thing this does is add an implementation of
set_perf_counter_pending().  This needs to arrange for
perf_counter_do_pending() to be called when interrupts are enabled.
Rather than adding code to local_irq_restore() as 64-bit does, the 32-bit
set_perf_counter_pending() generates an interrupt by setting the
decrementer to 1 so that a decrementer interrupt will become pending
in 1 or 2 timebase ticks (if a decrementer interrupt isn't already
pending).  When interrupts are enabled, timer_interrupt() will be
called, and some new code in there calls perf_counter_do_pending().
We use a per-cpu array of flags to indicate whether we need to call
perf_counter_do_pending() or not.
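
Condensed, the mechanism amounts to the following (see the time.c
hunks below for the real code):

	DEFINE_PER_CPU(u8, perf_counter_pending);

	void set_perf_counter_pending(void)
	{
		get_cpu_var(perf_counter_pending) = 1;
		set_dec(1);	/* decrementer fires in 1-2 timebase ticks */
		put_cpu_var(perf_counter_pending);
	}

	/* later, in timer_interrupt(), once interrupts are enabled: */
	if (test_perf_counter_pending()) {
		clear_perf_counter_pending();
		perf_counter_do_pending();
	}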

This introduces a couple of new Kconfig symbols: PPC_HAVE_PMU_SUPPORT,
which is selected by processor families for which we have hardware PMU
support (currently only PPC64), and PPC_PERF_CTRS, which enables the
powerpc-specific perf_counter back-end.
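
In sketch form, the new Kconfig wiring looks like this (the actual
hunks follow): processor families select PPC_HAVE_PMU_SUPPORT, and the
back-end code is built only when PERF_COUNTERS is also enabled:

	config PPC_HAVE_PMU_SUPPORT
		bool

	config PPC_PERF_CTRS
		def_bool y
		depends on PERF_COUNTERS && PPC_HAVE_PMU_SUPPORT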

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linuxppc-dev@ozlabs.org
Cc: benh@kernel.crashing.org
LKML-Reference: <19000.55404.103840.393470@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/Kconfig                    |  1 +
 arch/powerpc/include/asm/hw_irq.h       |  5 ++++-
 arch/powerpc/include/asm/perf_counter.h | 10 ++++++++--
 arch/powerpc/kernel/Makefile            |  6 +++---
 arch/powerpc/kernel/time.c              | 25 +++++++++++++++++++++++++
 arch/powerpc/platforms/Kconfig.cputype  | 11 ++++++++++-
 6 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9fb344d5a86a..bf6cedfa05db 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -126,6 +126,7 @@ config PPC
 	select HAVE_OPROFILE
 	select HAVE_SYSCALL_WRAPPERS if PPC64
 	select GENERIC_ATOMIC64 if PPC32
+	select HAVE_PERF_COUNTERS
 
 config EARLY_PRINTK
 	bool
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 10a642df014e..867ab8ed69b3 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,6 +131,8 @@ static inline int irqs_disabled_flags(unsigned long flags)
 struct irq_chip;
 
 #ifdef CONFIG_PERF_COUNTERS
+
+#ifdef CONFIG_PPC64
 static inline unsigned long test_perf_counter_pending(void)
 {
 	unsigned long x;
@@ -154,8 +156,9 @@ static inline void clear_perf_counter_pending(void)
 		"r" (0),
 		"i" (offsetof(struct paca_struct, perf_counter_pending)));
 }
+#endif /* CONFIG_PPC64 */
 
-#else
+#else  /* CONFIG_PERF_COUNTERS */
 
 static inline unsigned long test_perf_counter_pending(void)
 {
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index b398a84edced..2c2d9f643df0 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -57,10 +57,16 @@ extern struct power_pmu *ppmu;
 
 struct pt_regs;
 extern unsigned long perf_misc_flags(struct pt_regs *regs);
-#define perf_misc_flags(regs)	perf_misc_flags(regs)
-
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
 
+/*
+ * Only override the default definitions in include/linux/perf_counter.h
+ * if we have hardware PMU support.
+ */
+#ifdef CONFIG_PPC_PERF_CTRS
+#define perf_misc_flags(regs)	perf_misc_flags(regs)
+#endif
+
 /*
  * The power_pmu.get_constraint function returns a 64-bit value and
  * a 64-bit mask that express the constraints between this event and
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 612b0c4dc26d..c5f93f061927 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -95,9 +95,9 @@ obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o power4-pmu.o ppc970-pmu.o \
-				   power5-pmu.o power5+-pmu.o power6-pmu.o \
-				   power7-pmu.o
+obj-$(CONFIG_PPC_PERF_CTRS)	+= perf_counter.o
+obj64-$(CONFIG_PPC_PERF_CTRS)	+= power4-pmu.o ppc970-pmu.o power5-pmu.o \
+				   power5+-pmu.o power6-pmu.o power7-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 15391c2ab013..eae4511ceeac 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -53,6 +53,7 @@
 #include <linux/posix-timers.h>
 #include <linux/irq.h>
 #include <linux/delay.h>
+#include <linux/perf_counter.h>
 
 #include <asm/io.h>
 #include <asm/processor.h>
@@ -525,6 +526,26 @@ void __init iSeries_time_init_early(void)
 }
 #endif /* CONFIG_PPC_ISERIES */
 
+#if defined(CONFIG_PERF_COUNTERS) && defined(CONFIG_PPC32)
+DEFINE_PER_CPU(u8, perf_counter_pending);
+
+void set_perf_counter_pending(void)
+{
+	get_cpu_var(perf_counter_pending) = 1;
+	set_dec(1);
+	put_cpu_var(perf_counter_pending);
+}
+
+#define test_perf_counter_pending()	__get_cpu_var(perf_counter_pending)
+#define clear_perf_counter_pending()	__get_cpu_var(perf_counter_pending) = 0
+
+#else  /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */
+
+#define test_perf_counter_pending()	0
+#define clear_perf_counter_pending()
+
+#endif /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */
+
 /*
  * For iSeries shared processors, we have to let the hypervisor
  * set the hardware decrementer.  We set a virtual decrementer
@@ -551,6 +572,10 @@ void timer_interrupt(struct pt_regs * regs)
 	set_dec(DECREMENTER_MAX);
 
 #ifdef CONFIG_PPC32
+	if (test_perf_counter_pending()) {
+		clear_perf_counter_pending();
+		perf_counter_do_pending();
+	}
 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
 		do_IRQ(regs);
 #endif
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index cca6b4fc719a..dd9f3ec5ee30 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,7 +1,7 @@
 config PPC64
 	bool "64-bit kernel"
 	default n
-	select HAVE_PERF_COUNTERS
+	select PPC_HAVE_PMU_SUPPORT
 	help
 	  This option selects whether a 32-bit or a 64-bit kernel
 	  will be built.
@@ -243,6 +243,15 @@ config VIRT_CPU_ACCOUNTING
 
 	  If in doubt, say Y here.
 
+config PPC_HAVE_PMU_SUPPORT
+       bool
+
+config PPC_PERF_CTRS
+       def_bool y
+       depends on PERF_COUNTERS && PPC_HAVE_PMU_SUPPORT
+       help
+         This enables the powerpc-specific perf_counter back-end.
+
 config SMP
 	depends on PPC_STD_MMU || FSL_BOOKE
 	bool "Symmetric multi-processing support"

From 448d64f8f4c147db466c549550767cc515a4d34c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 17 Jun 2009 21:51:13 +1000
Subject: [PATCH 32/49] perf_counter: powerpc: Use unsigned long for register
 and constraint values

This changes the powerpc perf_counter back-end to use unsigned long
types for hardware register values and for the value/mask pairs used
in checking whether a given set of events fit within the hardware
constraints.  This is in preparation for adding support for the PMU
on some 32-bit powerpc processors.  On 32-bit processors the hardware
registers are only 32 bits wide, and the PMU structure is generally
simpler, so 32 bits should be ample for expressing the hardware
constraints.  On 64-bit processors, unsigned long is 64 bits wide,
so using unsigned long vs. u64 (unsigned long long) makes no actual
difference.
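
An illustrative compile-time check (not part of this patch) of the
width assumption being relied on here:

	/* unsigned long follows the native register width on powerpc */
	static inline void check_reg_width(void)
	{
	#ifdef CONFIG_PPC64
		BUILD_BUG_ON(sizeof(unsigned long) != sizeof(u64));
	#else
		BUILD_BUG_ON(sizeof(unsigned long) != 4);
	#endif
	}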

This makes some other very minor changes: adjusting whitespace to line
things up in initialized structures, and simplifying some code in
hw_perf_disable().

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linuxppc-dev@ozlabs.org
Cc: benh@kernel.crashing.org
LKML-Reference: <19000.55473.26174.331511@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/perf_counter.h | 35 ++++++-----
 arch/powerpc/kernel/perf_counter.c      | 20 +++---
 arch/powerpc/kernel/power4-pmu.c        | 74 +++++++++++-----------
 arch/powerpc/kernel/power5+-pmu.c       | 79 ++++++++++++-----------
 arch/powerpc/kernel/power5-pmu.c        | 83 +++++++++++++------------
 arch/powerpc/kernel/power6-pmu.c        | 57 ++++++++---------
 arch/powerpc/kernel/power7-pmu.c        | 46 +++++++-------
 arch/powerpc/kernel/ppc970-pmu.c        | 47 +++++++-------
 8 files changed, 229 insertions(+), 212 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 2c2d9f643df0..2ceb0fefa93a 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -21,21 +21,22 @@
  * describe the PMU on a particular POWER-family CPU.
  */
 struct power_pmu {
-	int	n_counter;
-	int	max_alternatives;
-	u64	add_fields;
-	u64	test_adder;
-	int	(*compute_mmcr)(u64 events[], int n_ev,
-				unsigned int hwc[], u64 mmcr[]);
-	int	(*get_constraint)(u64 event, u64 *mskp, u64 *valp);
-	int	(*get_alternatives)(u64 event, unsigned int flags,
-				    u64 alt[]);
-	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
-	int	(*limited_pmc_event)(u64 event);
-	u32	flags;
-	int	n_generic;
-	int	*generic_events;
-	int	(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+	int		n_counter;
+	int		max_alternatives;
+	unsigned long	add_fields;
+	unsigned long	test_adder;
+	int		(*compute_mmcr)(u64 events[], int n_ev,
+				unsigned int hwc[], unsigned long mmcr[]);
+	int		(*get_constraint)(u64 event, unsigned long *mskp,
+				unsigned long *valp);
+	int		(*get_alternatives)(u64 event, unsigned int flags,
+				u64 alt[]);
+	void		(*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
+	int		(*limited_pmc_event)(u64 event);
+	u32		flags;
+	int		n_generic;
+	int		*generic_events;
+	int		(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
 			       [PERF_COUNT_HW_CACHE_OP_MAX]
 			       [PERF_COUNT_HW_CACHE_RESULT_MAX];
 };
@@ -68,8 +69,8 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
 #endif
 
 /*
- * The power_pmu.get_constraint function returns a 64-bit value and
- * a 64-bit mask that express the constraints between this event and
+ * The power_pmu.get_constraint function returns a 32/64-bit value and
+ * a 32/64-bit mask that express the constraints between this event and
  * other events.
  *
  * The value and mask are divided up into (non-overlapping) bitfields
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index e6dc1850191c..9300638b8c26 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -29,7 +29,7 @@ struct cpu_hw_counters {
 	struct perf_counter *counter[MAX_HWCOUNTERS];
 	u64 events[MAX_HWCOUNTERS];
 	unsigned int flags[MAX_HWCOUNTERS];
-	u64 mmcr[3];
+	unsigned long mmcr[3];
 	struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
 	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
 };
@@ -135,15 +135,15 @@ static void write_pmc(int idx, unsigned long val)
 static int power_check_constraints(u64 event[], unsigned int cflags[],
 				   int n_ev)
 {
-	u64 mask, value, nv;
+	unsigned long mask, value, nv;
 	u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
-	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
-	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
-	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
+	unsigned long amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	unsigned long avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	unsigned long smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
 	int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
 	int i, j;
-	u64 addf = ppmu->add_fields;
-	u64 tadd = ppmu->test_adder;
+	unsigned long addf = ppmu->add_fields;
+	unsigned long tadd = ppmu->test_adder;
 
 	if (n_ev > ppmu->n_counter)
 		return -1;
@@ -403,14 +403,12 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
 void hw_perf_disable(void)
 {
 	struct cpu_hw_counters *cpuhw;
-	unsigned long ret;
 	unsigned long flags;
 
 	local_irq_save(flags);
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 
-	ret = cpuhw->disabled;
-	if (!ret) {
+	if (!cpuhw->disabled) {
 		cpuhw->disabled = 1;
 		cpuhw->n_added = 0;
 
@@ -1013,9 +1011,9 @@ static void record_and_restart(struct perf_counter *counter, long val,
 			       struct pt_regs *regs, int nmi)
 {
 	u64 period = counter->hw.sample_period;
+	unsigned long mmcra, sdsync;
 	s64 prev, delta, left;
 	int record = 0;
-	u64 mmcra, sdsync;
 
 	/* we don't have to worry about interrupts here */
 	prev = atomic64_read(&counter->hw.prev_count);
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 07bd308a5fa7..81a1708f83b2 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -179,22 +179,22 @@ static short mmcr1_adder_bits[8] = {
  */
 
 static struct unitinfo {
-	u64	value, mask;
-	int	unit;
-	int	lowerbit;
+	unsigned long	value, mask;
+	int		unit;
+	int		lowerbit;
 } p4_unitinfo[16] = {
-	[PM_FPU]  = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
-	[PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+	[PM_FPU]  = { 0x44000000000000ul, 0x88000000000000ul, PM_FPU, 0 },
+	[PM_ISU1] = { 0x20080000000000ul, 0x88000000000000ul, PM_ISU1, 0 },
 	[PM_ISU1_ALT] =
-		    { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
-	[PM_IFU]  = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+		    { 0x20080000000000ul, 0x88000000000000ul, PM_ISU1, 0 },
+	[PM_IFU]  = { 0x02200000000000ul, 0x08820000000000ul, PM_IFU, 41 },
 	[PM_IFU_ALT] =
-		    { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
-	[PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
-	[PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
-	[PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
-	[PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
-	[PM_GPS]  = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
+		    { 0x02200000000000ul, 0x08820000000000ul, PM_IFU, 41 },
+	[PM_IDU0] = { 0x10100000000000ul, 0x80840000000000ul, PM_IDU0, 1 },
+	[PM_ISU2] = { 0x10140000000000ul, 0x80840000000000ul, PM_ISU2, 0 },
+	[PM_LSU0] = { 0x01400000000000ul, 0x08800000000000ul, PM_LSU0, 0 },
+	[PM_LSU1] = { 0x00000000000000ul, 0x00010000000000ul, PM_LSU1, 40 },
+	[PM_GPS]  = { 0x00000000000000ul, 0x00000000000000ul, PM_GPS, 0 }
 };
 
 static unsigned char direct_marked_event[8] = {
@@ -249,10 +249,11 @@ static int p4_marked_instr_event(u64 event)
 	return (mask >> (byte * 8 + bit)) & 1;
 }
 
-static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp)
+static int p4_get_constraint(u64 event, unsigned long *maskp,
+			     unsigned long *valp)
 {
 	int pmc, byte, unit, lower, sh;
-	u64 mask = 0, value = 0;
+	unsigned long mask = 0, value = 0;
 	int grp = -1;
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
@@ -282,14 +283,14 @@ static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp)
 		value |= p4_unitinfo[unit].value;
 		sh = p4_unitinfo[unit].lowerbit;
 		if (sh > 1)
-			value |= (u64)lower << sh;
+			value |= (unsigned long)lower << sh;
 		else if (lower != sh)
 			return -1;
 		unit = p4_unitinfo[unit].unit;
 
 		/* Set byte lane select field */
 		mask  |= 0xfULL << (28 - 4 * byte);
-		value |= (u64)unit << (28 - 4 * byte);
+		value |= (unsigned long)unit << (28 - 4 * byte);
 	}
 	if (grp == 0) {
 		/* increment PMC1/2/5/6 field */
@@ -353,9 +354,9 @@ static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 }
 
 static int p4_compute_mmcr(u64 event[], int n_ev,
-			   unsigned int hwc[], u64 mmcr[])
+			   unsigned int hwc[], unsigned long mmcr[])
 {
-	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+	unsigned long mmcr0 = 0, mmcr1 = 0, mmcra = 0;
 	unsigned int pmc, unit, byte, psel, lower;
 	unsigned int ttm, grp;
 	unsigned int pmc_inuse = 0;
@@ -429,9 +430,11 @@ static int p4_compute_mmcr(u64 event[], int n_ev,
 		return -1;
 
 	/* Set TTMxSEL fields.  Note, units 1-3 => TTM0SEL codes 0-2 */
-	mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
-	mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
-	mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
+	mmcr1 |= (unsigned long)(unituse[3] * 2 + unituse[2])
+		<< MMCR1_TTM0SEL_SH;
+	mmcr1 |= (unsigned long)(unituse[7] * 3 + unituse[6] * 2)
+		<< MMCR1_TTM1SEL_SH;
+	mmcr1 |= (unsigned long)unituse[9] << MMCR1_TTM2SEL_SH;
 
 	/* Set TTCxSEL fields. */
 	if (unitlower & 0xe)
@@ -456,7 +459,8 @@ static int p4_compute_mmcr(u64 event[], int n_ev,
 				ttm = unit - 1;		/* 2->1, 3->2 */
 			else
 				ttm = unit >> 2;
-			mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
+			mmcr1 |= (unsigned long)ttm
+				<< (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
 		}
 	}
 
@@ -519,7 +523,7 @@ static int p4_compute_mmcr(u64 event[], int n_ev,
 	return 0;
 }
 
-static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
+static void p4_disable_pmc(unsigned int pmc, unsigned long mmcr[])
 {
 	/*
 	 * Setting the PMCxSEL field to 0 disables PMC x.
@@ -584,15 +588,15 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 };
 
 struct power_pmu power4_pmu = {
-	.n_counter = 8,
-	.max_alternatives = 5,
-	.add_fields = 0x0000001100005555ull,
-	.test_adder = 0x0011083300000000ull,
-	.compute_mmcr = p4_compute_mmcr,
-	.get_constraint = p4_get_constraint,
-	.get_alternatives = p4_get_alternatives,
-	.disable_pmc = p4_disable_pmc,
-	.n_generic = ARRAY_SIZE(p4_generic_events),
-	.generic_events = p4_generic_events,
-	.cache_events = &power4_cache_events,
+	.n_counter		= 8,
+	.max_alternatives	= 5,
+	.add_fields		= 0x0000001100005555ul,
+	.test_adder		= 0x0011083300000000ul,
+	.compute_mmcr		= p4_compute_mmcr,
+	.get_constraint		= p4_get_constraint,
+	.get_alternatives	= p4_get_alternatives,
+	.disable_pmc		= p4_disable_pmc,
+	.n_generic		= ARRAY_SIZE(p4_generic_events),
+	.generic_events		= p4_generic_events,
+	.cache_events		= &power4_cache_events,
 };
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 41e5d2d958d4..aef144d503b0 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -126,20 +126,21 @@ static const int grsel_shift[8] = {
 };
 
 /* Masks and values for using events from the various units */
-static u64 unit_cons[PM_LASTUNIT+1][2] = {
-	[PM_FPU] =   { 0x3200000000ull, 0x0100000000ull },
-	[PM_ISU0] =  { 0x0200000000ull, 0x0080000000ull },
-	[PM_ISU1] =  { 0x3200000000ull, 0x3100000000ull },
-	[PM_IFU] =   { 0x3200000000ull, 0x2100000000ull },
-	[PM_IDU] =   { 0x0e00000000ull, 0x0040000000ull },
-	[PM_GRS] =   { 0x0e00000000ull, 0x0c40000000ull },
+static unsigned long unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0x3200000000ul, 0x0100000000ul },
+	[PM_ISU0] =  { 0x0200000000ul, 0x0080000000ul },
+	[PM_ISU1] =  { 0x3200000000ul, 0x3100000000ul },
+	[PM_IFU] =   { 0x3200000000ul, 0x2100000000ul },
+	[PM_IDU] =   { 0x0e00000000ul, 0x0040000000ul },
+	[PM_GRS] =   { 0x0e00000000ul, 0x0c40000000ul },
 };
 
-static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp)
+static int power5p_get_constraint(u64 event, unsigned long *maskp,
+				  unsigned long *valp)
 {
 	int pmc, byte, unit, sh;
 	int bit, fmask;
-	u64 mask = 0, value = 0;
+	unsigned long mask = 0, value = 0;
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 	if (pmc) {
@@ -171,17 +172,18 @@ static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp)
 			bit = event & 7;
 			fmask = (bit == 6)? 7: 3;
 			sh = grsel_shift[bit];
-			mask |= (u64)fmask << sh;
-			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+			mask |= (unsigned long)fmask << sh;
+			value |= (unsigned long)((event >> PM_GRS_SH) & fmask)
+				<< sh;
 		}
 		/* Set byte lane select field */
-		mask  |= 0xfULL << (24 - 4 * byte);
-		value |= (u64)unit << (24 - 4 * byte);
+		mask  |= 0xfUL << (24 - 4 * byte);
+		value |= (unsigned long)unit << (24 - 4 * byte);
 	}
 	if (pmc < 5) {
 		/* need a counter from PMC1-4 set */
-		mask  |= 0x8000000000000ull;
-		value |= 0x1000000000000ull;
+		mask  |= 0x8000000000000ul;
+		value |= 0x1000000000000ul;
 	}
 	*maskp = mask;
 	*valp = value;
@@ -452,10 +454,10 @@ static int power5p_marked_instr_event(u64 event)
 }
 
 static int power5p_compute_mmcr(u64 event[], int n_ev,
-				unsigned int hwc[], u64 mmcr[])
+				unsigned int hwc[], unsigned long mmcr[])
 {
-	u64 mmcr1 = 0;
-	u64 mmcra = 0;
+	unsigned long mmcr1 = 0;
+	unsigned long mmcra = 0;
 	unsigned int pmc, unit, byte, psel;
 	unsigned int ttm;
 	int i, isbus, bit, grsel;
@@ -517,7 +519,7 @@ static int power5p_compute_mmcr(u64 event[], int n_ev,
 			continue;
 		if (ttmuse++)
 			return -1;
-		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+		mmcr1 |= (unsigned long)i << MMCR1_TTM0SEL_SH;
 	}
 	ttmuse = 0;
 	for (; i <= PM_GRS; ++i) {
@@ -525,7 +527,7 @@ static int power5p_compute_mmcr(u64 event[], int n_ev,
 			continue;
 		if (ttmuse++)
 			return -1;
-		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+		mmcr1 |= (unsigned long)(i & 3) << MMCR1_TTM1SEL_SH;
 	}
 	if (ttmuse > 1)
 		return -1;
@@ -540,10 +542,11 @@ static int power5p_compute_mmcr(u64 event[], int n_ev,
 			unit = PM_ISU0_ALT;
 		} else if (unit == PM_LSU1 + 1) {
 			/* select lower word of LSU1 for this byte */
-			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+			mmcr1 |= 1ul << (MMCR1_TTM3SEL_SH + 3 - byte);
 		}
 		ttm = unit >> 2;
-		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+		mmcr1 |= (unsigned long)ttm
+			<< (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
 	}
 
 	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
@@ -568,7 +571,7 @@ static int power5p_compute_mmcr(u64 event[], int n_ev,
 			if (isbus && (byte & 2) &&
 			    (psel == 8 || psel == 0x10 || psel == 0x28))
 				/* add events on higher-numbered bus */
-				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+				mmcr1 |= 1ul << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
 		} else {
 			/* Instructions or run cycles on PMC5/6 */
 			--pmc;
@@ -576,7 +579,7 @@ static int power5p_compute_mmcr(u64 event[], int n_ev,
 		if (isbus && unit == PM_GRS) {
 			bit = psel & 7;
 			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
-			mmcr1 |= (u64)grsel << grsel_shift[bit];
+			mmcr1 |= (unsigned long)grsel << grsel_shift[bit];
 		}
 		if (power5p_marked_instr_event(event[i]))
 			mmcra |= MMCRA_SAMPLE_ENABLE;
@@ -599,7 +602,7 @@ static int power5p_compute_mmcr(u64 event[], int n_ev,
 	return 0;
 }
 
-static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
+static void power5p_disable_pmc(unsigned int pmc, unsigned long mmcr[])
 {
 	if (pmc <= 3)
 		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
@@ -655,17 +658,17 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 };
 
 struct power_pmu power5p_pmu = {
-	.n_counter = 6,
-	.max_alternatives = MAX_ALT,
-	.add_fields = 0x7000000000055ull,
-	.test_adder = 0x3000040000000ull,
-	.compute_mmcr = power5p_compute_mmcr,
-	.get_constraint = power5p_get_constraint,
-	.get_alternatives = power5p_get_alternatives,
-	.disable_pmc = power5p_disable_pmc,
-	.limited_pmc_event = power5p_limited_pmc_event,
-	.flags = PPMU_LIMITED_PMC5_6,
-	.n_generic = ARRAY_SIZE(power5p_generic_events),
-	.generic_events = power5p_generic_events,
-	.cache_events = &power5p_cache_events,
+	.n_counter		= 6,
+	.max_alternatives	= MAX_ALT,
+	.add_fields		= 0x7000000000055ul,
+	.test_adder		= 0x3000040000000ul,
+	.compute_mmcr		= power5p_compute_mmcr,
+	.get_constraint		= power5p_get_constraint,
+	.get_alternatives	= power5p_get_alternatives,
+	.disable_pmc		= power5p_disable_pmc,
+	.limited_pmc_event	= power5p_limited_pmc_event,
+	.flags			= PPMU_LIMITED_PMC5_6,
+	.n_generic		= ARRAY_SIZE(power5p_generic_events),
+	.generic_events		= power5p_generic_events,
+	.cache_events		= &power5p_cache_events,
 };
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 05600b66221a..8694c73bfb52 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -130,20 +130,21 @@ static const int grsel_shift[8] = {
 };
 
 /* Masks and values for using events from the various units */
-static u64 unit_cons[PM_LASTUNIT+1][2] = {
-	[PM_FPU] =   { 0xc0002000000000ull, 0x00001000000000ull },
-	[PM_ISU0] =  { 0x00002000000000ull, 0x00000800000000ull },
-	[PM_ISU1] =  { 0xc0002000000000ull, 0xc0001000000000ull },
-	[PM_IFU] =   { 0xc0002000000000ull, 0x80001000000000ull },
-	[PM_IDU] =   { 0x30002000000000ull, 0x00000400000000ull },
-	[PM_GRS] =   { 0x30002000000000ull, 0x30000400000000ull },
+static unsigned long unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0xc0002000000000ul, 0x00001000000000ul },
+	[PM_ISU0] =  { 0x00002000000000ul, 0x00000800000000ul },
+	[PM_ISU1] =  { 0xc0002000000000ul, 0xc0001000000000ul },
+	[PM_IFU] =   { 0xc0002000000000ul, 0x80001000000000ul },
+	[PM_IDU] =   { 0x30002000000000ul, 0x00000400000000ul },
+	[PM_GRS] =   { 0x30002000000000ul, 0x30000400000000ul },
 };
 
-static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
+static int power5_get_constraint(u64 event, unsigned long *maskp,
+				 unsigned long *valp)
 {
 	int pmc, byte, unit, sh;
 	int bit, fmask;
-	u64 mask = 0, value = 0;
+	unsigned long mask = 0, value = 0;
 	int grp = -1;
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
@@ -178,8 +179,9 @@ static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
 			bit = event & 7;
 			fmask = (bit == 6)? 7: 3;
 			sh = grsel_shift[bit];
-			mask |= (u64)fmask << sh;
-			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+			mask |= (unsigned long)fmask << sh;
+			value |= (unsigned long)((event >> PM_GRS_SH) & fmask)
+				<< sh;
 		}
 		/*
 		 * Bus events on bytes 0 and 2 can be counted
@@ -188,22 +190,22 @@ static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
 		if (!pmc)
 			grp = byte & 1;
 		/* Set byte lane select field */
-		mask  |= 0xfULL << (24 - 4 * byte);
-		value |= (u64)unit << (24 - 4 * byte);
+		mask  |= 0xfUL << (24 - 4 * byte);
+		value |= (unsigned long)unit << (24 - 4 * byte);
 	}
 	if (grp == 0) {
 		/* increment PMC1/2 field */
-		mask  |= 0x200000000ull;
-		value |= 0x080000000ull;
+		mask  |= 0x200000000ul;
+		value |= 0x080000000ul;
 	} else if (grp == 1) {
 		/* increment PMC3/4 field */
-		mask  |= 0x40000000ull;
-		value |= 0x10000000ull;
+		mask  |= 0x40000000ul;
+		value |= 0x10000000ul;
 	}
 	if (pmc < 5) {
 		/* need a counter from PMC1-4 set */
-		mask  |= 0x8000000000000ull;
-		value |= 0x1000000000000ull;
+		mask  |= 0x8000000000000ul;
+		value |= 0x1000000000000ul;
 	}
 	*maskp = mask;
 	*valp = value;
@@ -383,10 +385,10 @@ static int power5_marked_instr_event(u64 event)
 }
 
 static int power5_compute_mmcr(u64 event[], int n_ev,
-			       unsigned int hwc[], u64 mmcr[])
+			       unsigned int hwc[], unsigned long mmcr[])
 {
-	u64 mmcr1 = 0;
-	u64 mmcra = 0;
+	unsigned long mmcr1 = 0;
+	unsigned long mmcra = 0;
 	unsigned int pmc, unit, byte, psel;
 	unsigned int ttm, grp;
 	int i, isbus, bit, grsel;
@@ -457,7 +459,7 @@ static int power5_compute_mmcr(u64 event[], int n_ev,
 			continue;
 		if (ttmuse++)
 			return -1;
-		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+		mmcr1 |= (unsigned long)i << MMCR1_TTM0SEL_SH;
 	}
 	ttmuse = 0;
 	for (; i <= PM_GRS; ++i) {
@@ -465,7 +467,7 @@ static int power5_compute_mmcr(u64 event[], int n_ev,
 			continue;
 		if (ttmuse++)
 			return -1;
-		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+		mmcr1 |= (unsigned long)(i & 3) << MMCR1_TTM1SEL_SH;
 	}
 	if (ttmuse > 1)
 		return -1;
@@ -480,10 +482,11 @@ static int power5_compute_mmcr(u64 event[], int n_ev,
 			unit = PM_ISU0_ALT;
 		} else if (unit == PM_LSU1 + 1) {
 			/* select lower word of LSU1 for this byte */
-			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+			mmcr1 |= 1ul << (MMCR1_TTM3SEL_SH + 3 - byte);
 		}
 		ttm = unit >> 2;
-		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+		mmcr1 |= (unsigned long)ttm
+			<< (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
 	}
 
 	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
@@ -513,7 +516,7 @@ static int power5_compute_mmcr(u64 event[], int n_ev,
 			--pmc;
 			if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
 				/* add events on higher-numbered bus */
-				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+				mmcr1 |= 1ul << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
 		} else {
 			/* Instructions or run cycles on PMC5/6 */
 			--pmc;
@@ -521,7 +524,7 @@ static int power5_compute_mmcr(u64 event[], int n_ev,
 		if (isbus && unit == PM_GRS) {
 			bit = psel & 7;
 			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
-			mmcr1 |= (u64)grsel << grsel_shift[bit];
+			mmcr1 |= (unsigned long)grsel << grsel_shift[bit];
 		}
 		if (power5_marked_instr_event(event[i]))
 			mmcra |= MMCRA_SAMPLE_ENABLE;
@@ -541,7 +544,7 @@ static int power5_compute_mmcr(u64 event[], int n_ev,
 	return 0;
 }
 
-static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
+static void power5_disable_pmc(unsigned int pmc, unsigned long mmcr[])
 {
 	if (pmc <= 3)
 		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
@@ -597,15 +600,15 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 };
 
 struct power_pmu power5_pmu = {
-	.n_counter = 6,
-	.max_alternatives = MAX_ALT,
-	.add_fields = 0x7000090000555ull,
-	.test_adder = 0x3000490000000ull,
-	.compute_mmcr = power5_compute_mmcr,
-	.get_constraint = power5_get_constraint,
-	.get_alternatives = power5_get_alternatives,
-	.disable_pmc = power5_disable_pmc,
-	.n_generic = ARRAY_SIZE(power5_generic_events),
-	.generic_events = power5_generic_events,
-	.cache_events = &power5_cache_events,
+	.n_counter		= 6,
+	.max_alternatives	= MAX_ALT,
+	.add_fields		= 0x7000090000555ul,
+	.test_adder		= 0x3000490000000ul,
+	.compute_mmcr		= power5_compute_mmcr,
+	.get_constraint		= power5_get_constraint,
+	.get_alternatives	= power5_get_alternatives,
+	.disable_pmc		= power5_disable_pmc,
+	.n_generic		= ARRAY_SIZE(power5_generic_events),
+	.generic_events		= power5_generic_events,
+	.cache_events		= &power5_cache_events,
 };
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 46f74bebcfd9..8898622ac28c 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -41,9 +41,9 @@
 #define MMCR1_NESTSEL_SH	45
 #define MMCR1_NESTSEL_MSK	0x7
 #define MMCR1_NESTSEL(m)	(((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
-#define MMCR1_PMC1_LLA		((u64)1 << 44)
-#define MMCR1_PMC1_LLA_VALUE	((u64)1 << 39)
-#define MMCR1_PMC1_ADDR_SEL	((u64)1 << 35)
+#define MMCR1_PMC1_LLA		(1ul << 44)
+#define MMCR1_PMC1_LLA_VALUE	(1ul << 39)
+#define MMCR1_PMC1_ADDR_SEL	(1ul << 35)
 #define MMCR1_PMC1SEL_SH	24
 #define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
 #define MMCR1_PMCSEL_MSK	0xff
@@ -173,10 +173,10 @@ static int power6_marked_instr_event(u64 event)
  * Assign PMC numbers and compute MMCR1 value for a set of events
  */
 static int p6_compute_mmcr(u64 event[], int n_ev,
-			   unsigned int hwc[], u64 mmcr[])
+			   unsigned int hwc[], unsigned long mmcr[])
 {
-	u64 mmcr1 = 0;
-	u64 mmcra = 0;
+	unsigned long mmcr1 = 0;
+	unsigned long mmcra = 0;
 	int i;
 	unsigned int pmc, ev, b, u, s, psel;
 	unsigned int ttmset = 0;
@@ -215,7 +215,7 @@ static int p6_compute_mmcr(u64 event[], int n_ev,
 			/* check for conflict on this byte of event bus */
 			if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
 				return -1;
-			mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
+			mmcr1 |= (unsigned long)u << MMCR1_TTMSEL_SH(b);
 			ttmset |= 1 << b;
 			if (u == 5) {
 				/* Nest events have a further mux */
@@ -224,7 +224,7 @@ static int p6_compute_mmcr(u64 event[], int n_ev,
 				    MMCR1_NESTSEL(mmcr1) != s)
 					return -1;
 				ttmset |= 0x10;
-				mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
+				mmcr1 |= (unsigned long)s << MMCR1_NESTSEL_SH;
 			}
 			if (0x30 <= psel && psel <= 0x3d) {
 				/* these need the PMCx_ADDR_SEL bits */
@@ -243,7 +243,7 @@ static int p6_compute_mmcr(u64 event[], int n_ev,
 		if (power6_marked_instr_event(event[i]))
 			mmcra |= MMCRA_SAMPLE_ENABLE;
 		if (pmc < 4)
-			mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
+			mmcr1 |= (unsigned long)psel << MMCR1_PMCSEL_SH(pmc);
 	}
 	mmcr[0] = 0;
 	if (pmc_inuse & 1)
@@ -265,10 +265,11 @@ static int p6_compute_mmcr(u64 event[], int n_ev,
  *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
  *	32-34	select field: nest (subunit) event selector
  */
-static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp)
+static int p6_get_constraint(u64 event, unsigned long *maskp,
+			     unsigned long *valp)
 {
 	int pmc, byte, sh, subunit;
-	u64 mask = 0, value = 0;
+	unsigned long mask = 0, value = 0;
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 	if (pmc) {
@@ -282,11 +283,11 @@ static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp)
 		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
 		sh = byte * 4 + (16 - PM_UNIT_SH);
 		mask |= PM_UNIT_MSKS << sh;
-		value |= (u64)(event & PM_UNIT_MSKS) << sh;
+		value |= (unsigned long)(event & PM_UNIT_MSKS) << sh;
 		if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
 			subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
-			mask  |= (u64)PM_SUBUNIT_MSK << 32;
-			value |= (u64)subunit << 32;
+			mask  |= (unsigned long)PM_SUBUNIT_MSK << 32;
+			value |= (unsigned long)subunit << 32;
 		}
 	}
 	if (pmc <= 4) {
@@ -458,7 +459,7 @@ static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 	return nalt;
 }
 
-static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
+static void p6_disable_pmc(unsigned int pmc, unsigned long mmcr[])
 {
 	/* Set PMCxSEL to 0 to disable PMCx */
 	if (pmc <= 3)
@@ -516,17 +517,17 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 };
 
 struct power_pmu power6_pmu = {
-	.n_counter = 6,
-	.max_alternatives = MAX_ALT,
-	.add_fields = 0x1555,
-	.test_adder = 0x3000,
-	.compute_mmcr = p6_compute_mmcr,
-	.get_constraint = p6_get_constraint,
-	.get_alternatives = p6_get_alternatives,
-	.disable_pmc = p6_disable_pmc,
-	.limited_pmc_event = p6_limited_pmc_event,
-	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
-	.n_generic = ARRAY_SIZE(power6_generic_events),
-	.generic_events = power6_generic_events,
-	.cache_events = &power6_cache_events,
+	.n_counter		= 6,
+	.max_alternatives	= MAX_ALT,
+	.add_fields		= 0x1555,
+	.test_adder		= 0x3000,
+	.compute_mmcr		= p6_compute_mmcr,
+	.get_constraint		= p6_get_constraint,
+	.get_alternatives	= p6_get_alternatives,
+	.disable_pmc		= p6_disable_pmc,
+	.limited_pmc_event	= p6_limited_pmc_event,
+	.flags			= PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
+	.n_generic		= ARRAY_SIZE(power6_generic_events),
+	.generic_events		= power6_generic_events,
+	.cache_events		= &power6_cache_events,
 };
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index b72e7a19d054..658d1ae436a0 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -71,10 +71,11 @@
  *     0-9: Count of events needing PMC1..PMC5
  */
 
-static int power7_get_constraint(u64 event, u64 *maskp, u64 *valp)
+static int power7_get_constraint(u64 event, unsigned long *maskp,
+				 unsigned long *valp)
 {
 	int pmc, sh;
-	u64 mask = 0, value = 0;
+	unsigned long mask = 0, value = 0;
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 	if (pmc) {
@@ -224,10 +225,10 @@ static int power7_marked_instr_event(u64 event)
 }
 
 static int power7_compute_mmcr(u64 event[], int n_ev,
-			       unsigned int hwc[], u64 mmcr[])
+			       unsigned int hwc[], unsigned long mmcr[])
 {
-	u64 mmcr1 = 0;
-	u64 mmcra = 0;
+	unsigned long mmcr1 = 0;
+	unsigned long mmcra = 0;
 	unsigned int pmc, unit, combine, l2sel, psel;
 	unsigned int pmc_inuse = 0;
 	int i;
@@ -265,11 +266,14 @@ static int power7_compute_mmcr(u64 event[], int n_ev,
 			--pmc;
 		}
 		if (pmc <= 3) {
-			mmcr1 |= (u64) unit << (MMCR1_TTM0SEL_SH - 4 * pmc);
-			mmcr1 |= (u64) combine << (MMCR1_PMC1_COMBINE_SH - pmc);
+			mmcr1 |= (unsigned long) unit
+				<< (MMCR1_TTM0SEL_SH - 4 * pmc);
+			mmcr1 |= (unsigned long) combine
+				<< (MMCR1_PMC1_COMBINE_SH - pmc);
 			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
 			if (unit == 6)	/* L2 events */
-				mmcr1 |= (u64) l2sel << MMCR1_L2SEL_SH;
+				mmcr1 |= (unsigned long) l2sel
+					<< MMCR1_L2SEL_SH;
 		}
 		if (power7_marked_instr_event(event[i]))
 			mmcra |= MMCRA_SAMPLE_ENABLE;
@@ -287,10 +291,10 @@ static int power7_compute_mmcr(u64 event[], int n_ev,
 	return 0;
 }
 
-static void power7_disable_pmc(unsigned int pmc, u64 mmcr[])
+static void power7_disable_pmc(unsigned int pmc, unsigned long mmcr[])
 {
 	if (pmc <= 3)
-		mmcr[1] &= ~(0xffULL << MMCR1_PMCSEL_SH(pmc));
+		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
 }
 
 static int power7_generic_events[] = {
@@ -343,15 +347,15 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 };
 
 struct power_pmu power7_pmu = {
-	.n_counter = 6,
-	.max_alternatives = MAX_ALT + 1,
-	.add_fields = 0x1555ull,
-	.test_adder = 0x3000ull,
-	.compute_mmcr = power7_compute_mmcr,
-	.get_constraint = power7_get_constraint,
-	.get_alternatives = power7_get_alternatives,
-	.disable_pmc = power7_disable_pmc,
-	.n_generic = ARRAY_SIZE(power7_generic_events),
-	.generic_events = power7_generic_events,
-	.cache_events = &power7_cache_events,
+	.n_counter		= 6,
+	.max_alternatives	= MAX_ALT + 1,
+	.add_fields		= 0x1555ul,
+	.test_adder		= 0x3000ul,
+	.compute_mmcr		= power7_compute_mmcr,
+	.get_constraint		= power7_get_constraint,
+	.get_alternatives	= power7_get_alternatives,
+	.disable_pmc		= power7_disable_pmc,
+	.n_generic		= ARRAY_SIZE(power7_generic_events),
+	.generic_events		= power7_generic_events,
+	.cache_events		= &power7_cache_events,
 };
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index ba0a357a89f4..3ed88333412f 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -183,7 +183,7 @@ static int p970_marked_instr_event(u64 event)
 }
 
 /* Masks and values for using events from the various units */
-static u64 unit_cons[PM_LASTUNIT+1][2] = {
+static unsigned long unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_FPU] =   { 0xc80000000000ull, 0x040000000000ull },
 	[PM_VPU] =   { 0xc80000000000ull, 0xc40000000000ull },
 	[PM_ISU] =   { 0x080000000000ull, 0x020000000000ull },
@@ -192,10 +192,11 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_STS] =   { 0x380000000000ull, 0x310000000000ull },
 };
 
-static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
+static int p970_get_constraint(u64 event, unsigned long *maskp,
+			       unsigned long *valp)
 {
 	int pmc, byte, unit, sh, spcsel;
-	u64 mask = 0, value = 0;
+	unsigned long mask = 0, value = 0;
 	int grp = -1;
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
@@ -222,7 +223,7 @@ static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
 			grp = byte & 1;
 		/* Set byte lane select field */
 		mask  |= 0xfULL << (28 - 4 * byte);
-		value |= (u64)unit << (28 - 4 * byte);
+		value |= (unsigned long)unit << (28 - 4 * byte);
 	}
 	if (grp == 0) {
 		/* increment PMC1/2/5/6 field */
@@ -236,7 +237,7 @@ static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
 	spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
 	if (spcsel) {
 		mask  |= 3ull << 48;
-		value |= (u64)spcsel << 48;
+		value |= (unsigned long)spcsel << 48;
 	}
 	*maskp = mask;
 	*valp = value;
@@ -257,9 +258,9 @@ static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 }
 
 static int p970_compute_mmcr(u64 event[], int n_ev,
-			     unsigned int hwc[], u64 mmcr[])
+			     unsigned int hwc[], unsigned long mmcr[])
 {
-	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+	unsigned long mmcr0 = 0, mmcr1 = 0, mmcra = 0;
 	unsigned int pmc, unit, byte, psel;
 	unsigned int ttm, grp;
 	unsigned int pmc_inuse = 0;
@@ -320,7 +321,7 @@ static int p970_compute_mmcr(u64 event[], int n_ev,
 			continue;
 		ttm = unitmap[i];
 		++ttmuse[(ttm >> 2) & 1];
-		mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
+		mmcr1 |= (unsigned long)(ttm & ~4) << MMCR1_TTM1SEL_SH;
 	}
 	/* Check only one unit per TTMx */
 	if (ttmuse[0] > 1 || ttmuse[1] > 1)
@@ -340,7 +341,8 @@ static int p970_compute_mmcr(u64 event[], int n_ev,
 			if (unit == PM_LSU1L && byte >= 2)
 				mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
 		}
-		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+		mmcr1 |= (unsigned long)ttm
+			<< (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
 	}
 
 	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
@@ -386,7 +388,8 @@ static int p970_compute_mmcr(u64 event[], int n_ev,
 	for (pmc = 0; pmc < 2; ++pmc)
 		mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
 	for (; pmc < 8; ++pmc)
-		mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+		mmcr1 |= (unsigned long)pmcsel[pmc]
+			<< (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
 	if (pmc_inuse & 1)
 		mmcr0 |= MMCR0_PMC1CE;
 	if (pmc_inuse & 0xfe)
@@ -401,7 +404,7 @@ static int p970_compute_mmcr(u64 event[], int n_ev,
 	return 0;
 }
 
-static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
+static void p970_disable_pmc(unsigned int pmc, unsigned long mmcr[])
 {
 	int shift, i;
 
@@ -468,15 +471,15 @@ static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 };
 
 struct power_pmu ppc970_pmu = {
-	.n_counter = 8,
-	.max_alternatives = 2,
-	.add_fields = 0x001100005555ull,
-	.test_adder = 0x013300000000ull,
-	.compute_mmcr = p970_compute_mmcr,
-	.get_constraint = p970_get_constraint,
-	.get_alternatives = p970_get_alternatives,
-	.disable_pmc = p970_disable_pmc,
-	.n_generic = ARRAY_SIZE(ppc970_generic_events),
-	.generic_events = ppc970_generic_events,
-	.cache_events = &ppc970_cache_events,
+	.n_counter		= 8,
+	.max_alternatives	= 2,
+	.add_fields		= 0x001100005555ull,
+	.test_adder		= 0x013300000000ull,
+	.compute_mmcr		= p970_compute_mmcr,
+	.get_constraint		= p970_get_constraint,
+	.get_alternatives	= p970_get_alternatives,
+	.disable_pmc		= p970_disable_pmc,
+	.n_generic		= ARRAY_SIZE(ppc970_generic_events),
+	.generic_events		= ppc970_generic_events,
+	.cache_events		= &ppc970_cache_events,
 };

From 079b3c569c87819e7a19d9b9f51d4746fc47bf9a Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 17 Jun 2009 21:52:09 +1000
Subject: [PATCH 33/49] perf_counter: powerpc: Change how processor-specific
 back-ends get selected

At present, the powerpc generic (processor-independent) perf_counter
code has a list of processor back-end modules, and at initialization
it looks at the PVR (processor version register) and uses a switch
statement to select a suitable processor-specific back-end.

This is going to become inconvenient as we add more processor-specific
back-ends, so this inverts the order: now each back-end checks whether
it applies to the current processor, and registers itself if so.
Furthermore, instead of looking at the PVR, back-ends now check the
cur_cpu_spec->oprofile_cpu_type string and match on that.

Lastly, each back-end now specifies a name for itself so the core can
print a nice message when a back-end registers itself.

This doesn't provide any support for unregistering back-ends, but that
wouldn't be hard to do and would allow back-ends to be modules.
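
With this scheme, a hypothetical new back-end (the names below are
made up for illustration) registers itself the same way the existing
ones do in the hunks that follow:

	static struct power_pmu powerN_pmu = {
		.name			= "POWERN",
		/* ... PMU description fields ... */
	};

	static int init_powerN_pmu(void)
	{
		if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/powerN"))
			return -ENODEV;

		return register_power_pmu(&powerN_pmu);
	}

	arch_initcall(init_powerN_pmu);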

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linuxppc-dev@ozlabs.org
Cc: benh@kernel.crashing.org
LKML-Reference: <19000.55529.762227.518531@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/perf_counter.h |  5 +--
 arch/powerpc/kernel/perf_counter.c      | 42 ++++---------------------
 arch/powerpc/kernel/power4-pmu.c        | 15 ++++++++-
 arch/powerpc/kernel/power5+-pmu.c       | 16 +++++++++-
 arch/powerpc/kernel/power5-pmu.c        | 15 ++++++++-
 arch/powerpc/kernel/power6-pmu.c        | 15 ++++++++-
 arch/powerpc/kernel/power7-pmu.c        | 15 ++++++++-
 arch/powerpc/kernel/ppc970-pmu.c        | 16 +++++++++-
 8 files changed, 95 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 2ceb0fefa93a..8ccd4e155768 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -21,6 +21,7 @@
  * describe the PMU on a particular POWER-family CPU.
  */
 struct power_pmu {
+	const char	*name;
 	int		n_counter;
 	int		max_alternatives;
 	unsigned long	add_fields;
@@ -41,8 +42,6 @@ struct power_pmu {
 			       [PERF_COUNT_HW_CACHE_RESULT_MAX];
 };
 
-extern struct power_pmu *ppmu;
-
 /*
  * Values for power_pmu.flags
  */
@@ -56,6 +55,8 @@ extern struct power_pmu *ppmu;
 #define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
 #define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
 
+extern int register_power_pmu(struct power_pmu *);
+
 struct pt_regs;
 extern unsigned long perf_misc_flags(struct pt_regs *regs);
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 9300638b8c26..25e656c14945 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1214,42 +1214,14 @@ void hw_perf_counter_setup(int cpu)
 	cpuhw->mmcr[0] = MMCR0_FC;
 }
 
-extern struct power_pmu power4_pmu;
-extern struct power_pmu ppc970_pmu;
-extern struct power_pmu power5_pmu;
-extern struct power_pmu power5p_pmu;
-extern struct power_pmu power6_pmu;
-extern struct power_pmu power7_pmu;
-
-static int init_perf_counters(void)
+int register_power_pmu(struct power_pmu *pmu)
 {
-	unsigned long pvr;
+	if (ppmu)
+		return -EBUSY;		/* something's already registered */
 
-	/* XXX should get this from cputable */
-	pvr = mfspr(SPRN_PVR);
-	switch (PVR_VER(pvr)) {
-	case PV_POWER4:
-	case PV_POWER4p:
-		ppmu = &power4_pmu;
-		break;
-	case PV_970:
-	case PV_970FX:
-	case PV_970MP:
-		ppmu = &ppc970_pmu;
-		break;
-	case PV_POWER5:
-		ppmu = &power5_pmu;
-		break;
-	case PV_POWER5p:
-		ppmu = &power5p_pmu;
-		break;
-	case 0x3e:
-		ppmu = &power6_pmu;
-		break;
-	case 0x3f:
-		ppmu = &power7_pmu;
-		break;
-	}
+	ppmu = pmu;
+	pr_info("%s performance monitor hardware support registered\n",
+		pmu->name);
 
 	/*
 	 * Use FCHV to ignore kernel events if MSR.HV is set.
@@ -1259,5 +1231,3 @@ static int init_perf_counters(void)
 
 	return 0;
 }
-
-arch_initcall(init_perf_counters);
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 81a1708f83b2..db90b0c5c27b 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -10,7 +10,9 @@
  */
 #include <linux/kernel.h>
 #include <linux/perf_counter.h>
+#include <linux/string.h>
 #include <asm/reg.h>
+#include <asm/cputable.h>
 
 /*
  * Bits in event code for POWER4
@@ -587,7 +589,8 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	},
 };
 
-struct power_pmu power4_pmu = {
+static struct power_pmu power4_pmu = {
+	.name			= "POWER4/4+",
 	.n_counter		= 8,
 	.max_alternatives	= 5,
 	.add_fields		= 0x0000001100005555ul,
@@ -600,3 +603,13 @@ struct power_pmu power4_pmu = {
 	.generic_events		= p4_generic_events,
 	.cache_events		= &power4_cache_events,
 };
+
+static int init_power4_pmu(void)
+{
+	if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power4"))
+		return -ENODEV;
+
+	return register_power_pmu(&power4_pmu);
+}
+
+arch_initcall(init_power4_pmu);
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index aef144d503b0..f4adca8e98a4 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -10,7 +10,9 @@
  */
 #include <linux/kernel.h>
 #include <linux/perf_counter.h>
+#include <linux/string.h>
 #include <asm/reg.h>
+#include <asm/cputable.h>
 
 /*
  * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
@@ -657,7 +659,8 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	},
 };
 
-struct power_pmu power5p_pmu = {
+static struct power_pmu power5p_pmu = {
+	.name			= "POWER5+/++",
 	.n_counter		= 6,
 	.max_alternatives	= MAX_ALT,
 	.add_fields		= 0x7000000000055ul,
@@ -672,3 +675,14 @@ struct power_pmu power5p_pmu = {
 	.generic_events		= power5p_generic_events,
 	.cache_events		= &power5p_cache_events,
 };
+
+static int init_power5p_pmu(void)
+{
+	if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5+")
+	    && strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5++"))
+		return -ENODEV;
+
+	return register_power_pmu(&power5p_pmu);
+}
+
+arch_initcall(init_power5p_pmu);
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 8694c73bfb52..29b2c6c0e83a 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -10,7 +10,9 @@
  */
 #include <linux/kernel.h>
 #include <linux/perf_counter.h>
+#include <linux/string.h>
 #include <asm/reg.h>
+#include <asm/cputable.h>
 
 /*
  * Bits in event code for POWER5 (not POWER5++)
@@ -599,7 +601,8 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	},
 };
 
-struct power_pmu power5_pmu = {
+static struct power_pmu power5_pmu = {
+	.name			= "POWER5",
 	.n_counter		= 6,
 	.max_alternatives	= MAX_ALT,
 	.add_fields		= 0x7000090000555ul,
@@ -612,3 +615,13 @@ struct power_pmu power5_pmu = {
 	.generic_events		= power5_generic_events,
 	.cache_events		= &power5_cache_events,
 };
+
+static int init_power5_pmu(void)
+{
+	if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5"))
+		return -ENODEV;
+
+	return register_power_pmu(&power5_pmu);
+}
+
+arch_initcall(init_power5_pmu);
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 8898622ac28c..09ae5bf5bda7 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -10,7 +10,9 @@
  */
 #include <linux/kernel.h>
 #include <linux/perf_counter.h>
+#include <linux/string.h>
 #include <asm/reg.h>
+#include <asm/cputable.h>
 
 /*
  * Bits in event code for POWER6
@@ -516,7 +518,8 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	},
 };
 
-struct power_pmu power6_pmu = {
+static struct power_pmu power6_pmu = {
+	.name			= "POWER6",
 	.n_counter		= 6,
 	.max_alternatives	= MAX_ALT,
 	.add_fields		= 0x1555,
@@ -531,3 +534,13 @@ struct power_pmu power6_pmu = {
 	.generic_events		= power6_generic_events,
 	.cache_events		= &power6_cache_events,
 };
+
+static int init_power6_pmu(void)
+{
+	if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power6"))
+		return -ENODEV;
+
+	return register_power_pmu(&power6_pmu);
+}
+
+arch_initcall(init_power6_pmu);
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index 658d1ae436a0..5d755ef7ac8f 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -10,7 +10,9 @@
  */
 #include <linux/kernel.h>
 #include <linux/perf_counter.h>
+#include <linux/string.h>
 #include <asm/reg.h>
+#include <asm/cputable.h>
 
 /*
  * Bits in event code for POWER7
@@ -346,7 +348,8 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	},
 };
 
-struct power_pmu power7_pmu = {
+static struct power_pmu power7_pmu = {
+	.name			= "POWER7",
 	.n_counter		= 6,
 	.max_alternatives	= MAX_ALT + 1,
 	.add_fields		= 0x1555ul,
@@ -359,3 +362,13 @@ struct power_pmu power7_pmu = {
 	.generic_events		= power7_generic_events,
 	.cache_events		= &power7_cache_events,
 };
+
+static int init_power7_pmu(void)
+{
+	if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7"))
+		return -ENODEV;
+
+	return register_power_pmu(&power7_pmu);
+}
+
+arch_initcall(init_power7_pmu);
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 3ed88333412f..6637c87fe70e 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -10,7 +10,9 @@
  */
 #include <linux/string.h>
 #include <linux/perf_counter.h>
+#include <linux/string.h>
 #include <asm/reg.h>
+#include <asm/cputable.h>
 
 /*
  * Bits in event code for PPC970
@@ -470,7 +472,8 @@ static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	},
 };
 
-struct power_pmu ppc970_pmu = {
+static struct power_pmu ppc970_pmu = {
+	.name			= "PPC970/FX/MP",
 	.n_counter		= 8,
 	.max_alternatives	= 2,
 	.add_fields		= 0x001100005555ull,
@@ -483,3 +486,14 @@ struct power_pmu ppc970_pmu = {
 	.generic_events		= ppc970_generic_events,
 	.cache_events		= &ppc970_cache_events,
 };
+
+static int init_ppc970_pmu(void)
+{
+	if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970")
+	    && strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970MP"))
+		return -ENODEV;
+
+	return register_power_pmu(&ppc970_pmu);
+}
+
+arch_initcall(init_ppc970_pmu);

From 98fb1807b97e3e631b940f67544e265c64b984dc Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 17 Jun 2009 21:53:10 +1000
Subject: [PATCH 34/49] perf_counter: powerpc: Make powerpc perf_counter code
 safe for 32-bit kernels

This abstracts a few things in arch/powerpc/kernel/perf_counter.c
that are specific to 64-bit kernels, and provides definitions for
32-bit kernels.  In particular,

* Only 64-bit has MMCRA and the bits in it that give information
  about a PMU interrupt (sampled PR, HV, slot number etc.)
* Only 64-bit has the lppaca and the lppaca->pmcregs_in_use field
* Use of SDAR is confined to 64-bit for now
* Only 64-bit has soft/lazy interrupt disable and therefore
  pseudo-NMIs (interrupts that occur while interrupts are soft-disabled)
* Only 64-bit has PMC7 and PMC8
* Only 64-bit has the MSR_HV bit.

This also fixes the types used in a couple of places, where we were
using long types for things that need to be 64-bit.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linuxppc-dev@ozlabs.org
Cc: benh@kernel.crashing.org
LKML-Reference: <19000.55590.634126.876084@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 193 ++++++++++++++++++++---------
 1 file changed, 133 insertions(+), 60 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 25e656c14945..809fdf94b95f 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -46,6 +46,115 @@ struct power_pmu *ppmu;
  */
 static unsigned int freeze_counters_kernel = MMCR0_FCS;
 
+/*
+ * 32-bit doesn't have MMCRA but does have an MMCR2,
+ * and a few other names are different.
+ */
+#ifdef CONFIG_PPC32
+
+#define MMCR0_FCHV		0
+#define MMCR0_PMCjCE		MMCR0_PMCnCE
+
+#define SPRN_MMCRA		SPRN_MMCR2
+#define MMCRA_SAMPLE_ENABLE	0
+
+static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
+{
+	return 0;
+}
+static inline void perf_set_pmu_inuse(int inuse) { }
+static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
+static inline u32 perf_get_misc_flags(struct pt_regs *regs)
+{
+	return 0;
+}
+static inline void perf_read_regs(struct pt_regs *regs) { }
+static inline int perf_intr_is_nmi(struct pt_regs *regs)
+{
+	return 0;
+}
+
+#endif /* CONFIG_PPC32 */
+
+/*
+ * Things that are specific to 64-bit implementations.
+ */
+#ifdef CONFIG_PPC64
+
+static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
+{
+	unsigned long mmcra = regs->dsisr;
+
+	if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+		unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+		if (slot > 1)
+			return 4 * (slot - 1);
+	}
+	return 0;
+}
+
+static inline void perf_set_pmu_inuse(int inuse)
+{
+	get_lppaca()->pmcregs_in_use = inuse;
+}
+
+/*
+ * The user wants a data address recorded.
+ * If we're not doing instruction sampling, give them the SDAR
+ * (sampled data address).  If we are doing instruction sampling, then
+ * only give them the SDAR if it corresponds to the instruction
+ * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC
+ * bit in MMCRA.
+ */
+static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
+{
+	unsigned long mmcra = regs->dsisr;
+	unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+		POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+
+	if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+		*addrp = mfspr(SPRN_SDAR);
+}
+
+static inline u32 perf_get_misc_flags(struct pt_regs *regs)
+{
+	unsigned long mmcra = regs->dsisr;
+
+	if (TRAP(regs) != 0xf00)
+		return 0;	/* not a PMU interrupt */
+
+	if (ppmu->flags & PPMU_ALT_SIPR) {
+		if (mmcra & POWER6_MMCRA_SIHV)
+			return PERF_EVENT_MISC_HYPERVISOR;
+		return (mmcra & POWER6_MMCRA_SIPR) ?
+			PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
+	}
+	if (mmcra & MMCRA_SIHV)
+		return PERF_EVENT_MISC_HYPERVISOR;
+	return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+		PERF_EVENT_MISC_KERNEL;
+}
+
+/*
+ * Overload regs->dsisr to store MMCRA so we only need to read it once
+ * on each interrupt.
+ */
+static inline void perf_read_regs(struct pt_regs *regs)
+{
+	regs->dsisr = mfspr(SPRN_MMCRA);
+}
+
+/*
+ * If interrupts were soft-disabled when a PMU interrupt occurs, treat
+ * it as an NMI.
+ */
+static inline int perf_intr_is_nmi(struct pt_regs *regs)
+{
+	return !regs->softe;
+}
+
+#endif /* CONFIG_PPC64 */
+
 static void perf_counter_interrupt(struct pt_regs *regs);
 
 void perf_counter_print_debug(void)
@@ -78,12 +187,14 @@ static unsigned long read_pmc(int idx)
 	case 6:
 		val = mfspr(SPRN_PMC6);
 		break;
+#ifdef CONFIG_PPC64
 	case 7:
 		val = mfspr(SPRN_PMC7);
 		break;
 	case 8:
 		val = mfspr(SPRN_PMC8);
 		break;
+#endif /* CONFIG_PPC64 */
 	default:
 		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
 		val = 0;
@@ -115,12 +226,14 @@ static void write_pmc(int idx, unsigned long val)
 	case 6:
 		mtspr(SPRN_PMC6, val);
 		break;
+#ifdef CONFIG_PPC64
 	case 7:
 		mtspr(SPRN_PMC7, val);
 		break;
 	case 8:
 		mtspr(SPRN_PMC8, val);
 		break;
+#endif /* CONFIG_PPC64 */
 	default:
 		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
 	}
@@ -283,7 +396,7 @@ static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
 
 static void power_pmu_read(struct perf_counter *counter)
 {
-	long val, delta, prev;
+	s64 val, delta, prev;
 
 	if (!counter->hw.idx)
 		return;
@@ -477,7 +590,7 @@ void hw_perf_enable(void)
 		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 		if (cpuhw->n_counters == 0)
-			get_lppaca()->pmcregs_in_use = 0;
+			perf_set_pmu_inuse(0);
 		goto out_enable;
 	}
 
@@ -510,7 +623,7 @@ void hw_perf_enable(void)
 	 * bit set and set the hardware counters to their initial values.
 	 * Then unfreeze the counters.
 	 */
-	get_lppaca()->pmcregs_in_use = 1;
+	perf_set_pmu_inuse(1);
 	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
@@ -1007,11 +1120,10 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
  * things if requested.  Note that interrupts are hard-disabled
  * here so there is no possibility of being interrupted.
  */
-static void record_and_restart(struct perf_counter *counter, long val,
+static void record_and_restart(struct perf_counter *counter, unsigned long val,
 			       struct pt_regs *regs, int nmi)
 {
 	u64 period = counter->hw.sample_period;
-	unsigned long mmcra, sdsync;
 	s64 prev, delta, left;
 	int record = 0;
 
@@ -1033,8 +1145,8 @@ static void record_and_restart(struct perf_counter *counter, long val,
 				left = period;
 			record = 1;
 		}
-		if (left < 0x80000000L)
-			val = 0x80000000L - left;
+		if (left < 0x80000000LL)
+			val = 0x80000000LL - left;
 	}
 
 	/*
@@ -1047,22 +1159,9 @@ static void record_and_restart(struct perf_counter *counter, long val,
 			.period	= counter->hw.last_period,
 		};
 
-		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
-			/*
-			 * The user wants a data address recorded.
-			 * If we're not doing instruction sampling,
-			 * give them the SDAR (sampled data address).
-			 * If we are doing instruction sampling, then only
-			 * give them the SDAR if it corresponds to the
-			 * instruction pointed to by SIAR; this is indicated
-			 * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
-			 */
-			mmcra = regs->dsisr;
-			sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
-				POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
-			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
-				data.addr = mfspr(SPRN_SDAR);
-		}
+		if (counter->attr.sample_type & PERF_SAMPLE_ADDR)
+			perf_get_data_addr(regs, &data.addr);
+
 		if (perf_counter_overflow(counter, nmi, &data)) {
 			/*
 			 * Interrupts are coming too fast - throttle them
@@ -1088,25 +1187,12 @@ static void record_and_restart(struct perf_counter *counter, long val,
  */
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
-	unsigned long mmcra;
+	u32 flags = perf_get_misc_flags(regs);
 
-	if (TRAP(regs) != 0xf00) {
-		/* not a PMU interrupt */
-		return user_mode(regs) ? PERF_EVENT_MISC_USER :
-			PERF_EVENT_MISC_KERNEL;
-	}
-
-	mmcra = regs->dsisr;
-	if (ppmu->flags & PPMU_ALT_SIPR) {
-		if (mmcra & POWER6_MMCRA_SIHV)
-			return PERF_EVENT_MISC_HYPERVISOR;
-		return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
-			PERF_EVENT_MISC_KERNEL;
-	}
-	if (mmcra & MMCRA_SIHV)
-		return PERF_EVENT_MISC_HYPERVISOR;
-	return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
-			PERF_EVENT_MISC_KERNEL;
+	if (flags)
+		return flags;
+	return user_mode(regs) ? PERF_EVENT_MISC_USER :
+		PERF_EVENT_MISC_KERNEL;
 }
 
 /*
@@ -1115,20 +1201,12 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
  */
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
-	unsigned long mmcra;
 	unsigned long ip;
-	unsigned long slot;
 
 	if (TRAP(regs) != 0xf00)
 		return regs->nip;	/* not a PMU interrupt */
 
-	ip = mfspr(SPRN_SIAR);
-	mmcra = regs->dsisr;
-	if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
-		slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
-		if (slot > 1)
-			ip += 4 * (slot - 1);
-	}
+	ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
 	return ip;
 }
 
@@ -1140,7 +1218,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	int i;
 	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
 	struct perf_counter *counter;
-	long val;
+	unsigned long val;
 	int found = 0;
 	int nmi;
 
@@ -1148,16 +1226,9 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
 					mfspr(SPRN_PMC6));
 
-	/*
-	 * Overload regs->dsisr to store MMCRA so we only need to read it once.
-	 */
-	regs->dsisr = mfspr(SPRN_MMCRA);
+	perf_read_regs(regs);
 
-	/*
-	 * If interrupts were soft-disabled when this PMU interrupt
-	 * occurred, treat it as an NMI.
-	 */
-	nmi = !regs->softe;
+	nmi = perf_intr_is_nmi(regs);
 	if (nmi)
 		nmi_enter();
 	else
@@ -1223,11 +1294,13 @@ int register_power_pmu(struct power_pmu *pmu)
 	pr_info("%s performance monitor hardware support registered\n",
 		pmu->name);
 
+#ifdef MSR_HV
 	/*
 	 * Use FCHV to ignore kernel events if MSR.HV is set.
 	 */
 	if (mfmsr() & MSR_HV)
 		freeze_counters_kernel = MMCR0_FCHV;
+#endif /* MSR_HV */
 
 	return 0;
 }

From 7325927e5a20bfe0f006acf92801bf41c537d3d4 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 17 Jun 2009 21:53:51 +1000
Subject: [PATCH 35/49] perf_counter: powerpc: Add processor back-end for
 MPC7450 family

This adds support for the performance monitor hardware on the
MPC7450 family of processors (7450, 7451, 7455, 7447/7457, 7447A,
7448), used in the later Apple G4 powermacs/powerbooks and other
machines.  These machines have 6 hardware counters, each with its
own set of countable events, and some events are available on
multiple counters.

Raw event codes for these processors are (PMC << 8) + PMCSEL.
If PMC is non-zero then the event is that selected by the given
PMCSEL value for that PMC (hardware counter).  If PMC is zero
then the event selected is one of the low-numbered ones that are
common to several PMCs.  In this case PMCSEL must be <= 22 and
the event is what that PMCSEL value would select on PMC1 (but
it may be placed on any other PMC that has the same event for that
PMCSEL value).

For events that count cycles or occurrences that exceed a threshold,
the threshold requested can be specified in the 0x3f000 bits of the
raw event codes.  If the event uses the threshold multiplier bit
and that bit should be set, that is indicated with the 0x40000 bit
of the raw event code.
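
For illustration, the whole encoding can be composed like this (a
minimal sketch; the helper name is hypothetical, and the shifts and
masks follow the description above):

	/* Hypothetical helper composing a raw event code as described. */
	static inline unsigned int mpc7450_raw_event(unsigned int pmc,
						     unsigned int pmcsel,
						     unsigned int thresh,
						     int thrmult)
	{
		return (thrmult ? 0x40000 : 0)	/* threshold multiplier bit */
		     | ((thresh & 0x3f) << 12)	/* threshold, the 0x3f000 bits */
		     | ((pmc & 7) << 8)		/* PMC number; 0 = common event */
		     | (pmcsel & 0x7f);		/* PMCSEL value */
	}

For example, 0x418 encodes PMCSEL 0x18 on PMC4, while a code with a
zero PMC field picks one of the common events by its PMC1 PMCSEL value.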

This fills in some of the generic cache events.  Unfortunately there
are quite a few blank spaces in the table, partly because these
processors tend to count cache hits rather than cache accesses.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linuxppc-dev@ozlabs.org
Cc: benh@kernel.crashing.org
LKML-Reference: <19000.55631.802122.696927@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/Makefile           |   2 +
 arch/powerpc/kernel/mpc7450-pmu.c      | 417 +++++++++++++++++++++++++
 arch/powerpc/platforms/Kconfig.cputype |   1 +
 3 files changed, 420 insertions(+)
 create mode 100644 arch/powerpc/kernel/mpc7450-pmu.c

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index c5f93f061927..a9f882963379 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 obj-$(CONFIG_PPC_PERF_CTRS)	+= perf_counter.o
 obj64-$(CONFIG_PPC_PERF_CTRS)	+= power4-pmu.o ppc970-pmu.o power5-pmu.o \
 				   power5+-pmu.o power6-pmu.o power7-pmu.o
+obj32-$(CONFIG_PPC_PERF_CTRS)	+= mpc7450-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
@@ -106,6 +107,7 @@ obj-y				+= iomap.o
 endif
 
 obj-$(CONFIG_PPC64)		+= $(obj64-y)
+obj-$(CONFIG_PPC32)		+= $(obj32-y)
 
 ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
 obj-y				+= ppc_save_regs.o
diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c
new file mode 100644
index 000000000000..75ff47fed7bf
--- /dev/null
+++ b/arch/powerpc/kernel/mpc7450-pmu.c
@@ -0,0 +1,417 @@
+/*
+ * Performance counter support for MPC7450-family processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/string.h>
+#include <linux/perf_counter.h>
+#include <linux/string.h>
+#include <asm/reg.h>
+#include <asm/cputable.h>
+
+#define N_COUNTER	6	/* Number of hardware counters */
+#define MAX_ALT		3	/* Maximum number of event alternative codes */
+
+/*
+ * Bits in event code for MPC7450 family
+ */
+#define PM_THRMULT_MSKS	0x40000
+#define PM_THRESH_SH	12
+#define PM_THRESH_MSK	0x3f
+#define PM_PMC_SH	8
+#define PM_PMC_MSK	7
+#define PM_PMCSEL_MSK	0x7f
+
+/*
+ * Classify events according to how specific their PMC requirements are.
+ * Result is:
+ *	0: can go on any PMC
+ *	1: can go on PMCs 1-4
+ *	2: can go on PMCs 1,2,4
+ *	3: can go on PMCs 1 or 2
+ *	4: can only go on one PMC
+ *	-1: event code is invalid
+ */
+#define N_CLASSES	5
+
+static int mpc7450_classify_event(u32 event)
+{
+	int pmc;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > N_COUNTER)
+			return -1;
+		return 4;
+	}
+	event &= PM_PMCSEL_MSK;
+	if (event <= 1)
+		return 0;
+	if (event <= 7)
+		return 1;
+	if (event <= 13)
+		return 2;
+	if (event <= 22)
+		return 3;
+	return -1;
+}
+
+/*
+ * Events using threshold and possible threshold scale:
+ *	code	scale?	name
+ *	11e	N	PM_INSTQ_EXCEED_CYC
+ *	11f	N	PM_ALTV_IQ_EXCEED_CYC
+ *	128	Y	PM_DTLB_SEARCH_EXCEED_CYC
+ *	12b	Y	PM_LD_MISS_EXCEED_L1_CYC
+ *	220	N	PM_CQ_EXCEED_CYC
+ *	30c	N	PM_GPR_RB_EXCEED_CYC
+ *	30d	?	PM_FPR_IQ_EXCEED_CYC ?
+ *	311	Y	PM_ITLB_SEARCH_EXCEED
+ *	410	N	PM_GPR_IQ_EXCEED_CYC
+ */
+
+/*
+ * Return use of threshold and threshold scale bits:
+ * 0 = uses neither, 1 = uses threshold, 2 = uses both
+ */
+static int mpc7450_threshold_use(u32 event)
+{
+	int pmc, sel;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	sel = event & PM_PMCSEL_MSK;
+	switch (pmc) {
+	case 1:
+		if (sel == 0x1e || sel == 0x1f)
+			return 1;
+		if (sel == 0x28 || sel == 0x2b)
+			return 2;
+		break;
+	case 2:
+		if (sel == 0x20)
+			return 1;
+		break;
+	case 3:
+		if (sel == 0xc || sel == 0xd)
+			return 1;
+		if (sel == 0x11)
+			return 2;
+		break;
+	case 4:
+		if (sel == 0x10)
+			return 1;
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Layout of constraint bits:
+ * 33222222222211111111110000000000
+ * 10987654321098765432109876543210
+ *  |<    ><  > < > < ><><><><><><>
+ *  TS TV   G4   G3  G2P6P5P4P3P2P1
+ *
+ * P1 - P6
+ *	0 - 11: Count of events needing PMC1 .. PMC6
+ *
+ * G2
+ *	12 - 14: Count of events needing PMC1 or PMC2
+ *
+ * G3
+ *	16 - 18: Count of events needing PMC1, PMC2 or PMC4
+ *
+ * G4
+ *	20 - 23: Count of events needing PMC1, PMC2, PMC3 or PMC4
+ *
+ * TV
+ *	24 - 29: Threshold value requested
+ *
+ * TS
+ *	30: Threshold scale value requested
+ */
+
+static u32 pmcbits[N_COUNTER][2] = {
+	{ 0x00844002, 0x00111001 },	/* PMC1 mask, value: P1,G2,G3,G4 */
+	{ 0x00844008, 0x00111004 },	/* PMC2: P2,G2,G3,G4 */
+	{ 0x00800020, 0x00100010 },	/* PMC3: P3,G4 */
+	{ 0x00840080, 0x00110040 },	/* PMC4: P4,G3,G4 */
+	{ 0x00000200, 0x00000100 },	/* PMC5: P5 */
+	{ 0x00000800, 0x00000400 }	/* PMC6: P6 */
+};
+
+static u32 classbits[N_CLASSES - 1][2] = {
+	{ 0x00000000, 0x00000000 },	/* class 0: no constraint */
+	{ 0x00800000, 0x00100000 },	/* class 1: G4 */
+	{ 0x00040000, 0x00010000 },	/* class 2: G3 */
+	{ 0x00004000, 0x00001000 },	/* class 3: G2 */
+};
+
+static int mpc7450_get_constraint(u64 event, unsigned long *maskp,
+				  unsigned long *valp)
+{
+	int pmc, class;
+	u32 mask, value;
+	int thresh, tuse;
+
+	class = mpc7450_classify_event(event);
+	if (class < 0)
+		return -1;
+	if (class == 4) {
+		pmc = ((unsigned int)event >> PM_PMC_SH) & PM_PMC_MSK;
+		mask  = pmcbits[pmc - 1][0];
+		value = pmcbits[pmc - 1][1];
+	} else {
+		mask  = classbits[class][0];
+		value = classbits[class][1];
+	}
+
+	tuse = mpc7450_threshold_use(event);
+	if (tuse) {
+		thresh = ((unsigned int)event >> PM_THRESH_SH) & PM_THRESH_MSK;
+		mask  |= 0x3f << 24;
+		value |= thresh << 24;
+		if (tuse == 2) {
+			mask |= 0x40000000;
+			if ((unsigned int)event & PM_THRMULT_MSKS)
+				value |= 0x40000000;
+		}
+	}
+
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x217, 0x317 },		/* PM_L1_DCACHE_MISS */
+	{ 0x418, 0x50f, 0x60f },	/* PM_SNOOP_RETRY */
+	{ 0x502, 0x602 },		/* PM_L2_HIT */
+	{ 0x503, 0x603 },		/* PM_L3_HIT */
+	{ 0x504, 0x604 },		/* PM_L2_ICACHE_MISS */
+	{ 0x505, 0x605 },		/* PM_L3_ICACHE_MISS */
+	{ 0x506, 0x606 },		/* PM_L2_DCACHE_MISS */
+	{ 0x507, 0x607 },		/* PM_L3_DCACHE_MISS */
+	{ 0x50a, 0x623 },		/* PM_LD_HIT_L3 */
+	{ 0x50b, 0x624 },		/* PM_ST_HIT_L3 */
+	{ 0x50d, 0x60d },		/* PM_L2_TOUCH_HIT */
+	{ 0x50e, 0x60e },		/* PM_L3_TOUCH_HIT */
+	{ 0x512, 0x612 },		/* PM_INT_LOCAL */
+	{ 0x513, 0x61d },		/* PM_L2_MISS */
+	{ 0x514, 0x61e },		/* PM_L3_MISS */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(u32 event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static int mpc7450_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+	int i, j, nalt = 1;
+	u32 ae;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative((u32)event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != (u32)event)
+				alt[nalt++] = ae;
+		}
+	}
+	return nalt;
+}
+
+/*
+ * Bitmaps of which PMCs each class can use for classes 0 - 3.
+ * Bit i is set if PMC i+1 is usable.
+ */
+static const u8 classmap[N_CLASSES] = {
+	0x3f, 0x0f, 0x0b, 0x03, 0
+};
+
+/* Bit position and width of each PMCSEL field */
+static const int pmcsel_shift[N_COUNTER] = {
+	6,	0,	27,	22,	17,	11
+};
+static const u32 pmcsel_mask[N_COUNTER] = {
+	0x7f,	0x3f,	0x1f,	0x1f,	0x1f,	0x3f
+};
+
+/*
+ * Compute MMCR0/1/2 values for a set of events.
+ */
+static int mpc7450_compute_mmcr(u64 event[], int n_ev,
+				unsigned int hwc[], unsigned long mmcr[])
+{
+	u8 event_index[N_CLASSES][N_COUNTER];
+	int n_classevent[N_CLASSES];
+	int i, j, class, tuse;
+	u32 pmc_inuse = 0, pmc_avail;
+	u32 mmcr0 = 0, mmcr1 = 0, mmcr2 = 0;
+	u32 ev, pmc, thresh;
+
+	if (n_ev > N_COUNTER)
+		return -1;
+
+	/* First pass: count usage in each class */
+	for (i = 0; i < N_CLASSES; ++i)
+		n_classevent[i] = 0;
+	for (i = 0; i < n_ev; ++i) {
+		class = mpc7450_classify_event(event[i]);
+		if (class < 0)
+			return -1;
+		j = n_classevent[class]++;
+		event_index[class][j] = i;
+	}
+
+	/* Second pass: allocate PMCs from most specific event to least */
+	for (class = N_CLASSES - 1; class >= 0; --class) {
+		for (i = 0; i < n_classevent[class]; ++i) {
+			ev = event[event_index[class][i]];
+			if (class == 4) {
+				pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
+				if (pmc_inuse & (1 << (pmc - 1)))
+					return -1;
+			} else {
+				/* Find a suitable PMC */
+				pmc_avail = classmap[class] & ~pmc_inuse;
+				if (!pmc_avail)
+					return -1;
+				pmc = ffs(pmc_avail);
+			}
+			pmc_inuse |= 1 << (pmc - 1);
+
+			tuse = mpc7450_threshold_use(ev);
+			if (tuse) {
+				thresh = (ev >> PM_THRESH_SH) & PM_THRESH_MSK;
+				mmcr0 |= thresh << 16;
+				if (tuse == 2 && (ev & PM_THRMULT_MSKS))
+					mmcr2 = 0x80000000;
+			}
+			ev &= pmcsel_mask[pmc - 1];
+			ev <<= pmcsel_shift[pmc - 1];
+			if (pmc <= 2)
+				mmcr0 |= ev;
+			else
+				mmcr1 |= ev;
+			hwc[event_index[class][i]] = pmc - 1;
+		}
+	}
+
+	if (pmc_inuse & 1)
+		mmcr0 |= MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr0 |= MMCR0_PMCnCE;
+
+	/* Return MMCRx values */
+	mmcr[0] = mmcr0;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcr2;
+	return 0;
+}
+
+/*
+ * Disable counting by a PMC.
+ * Note that the pmc argument is 0-based here, not 1-based.
+ */
+static void mpc7450_disable_pmc(unsigned int pmc, unsigned long mmcr[])
+{
+	if (pmc <= 1)
+		mmcr[0] &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]);
+	else
+		mmcr[1] &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]);
+}
+
+static int mpc7450_generic_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= 1,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 2,
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x217, /* PM_L1_DCACHE_MISS */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x122, /* PM_BR_CMPL */
+	[PERF_COUNT_HW_BRANCH_MISSES] 		= 0x41c, /* PM_BR_MPRED */
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int mpc7450_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x225	},
+		[C(OP_WRITE)] = {	0,		0x227	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x129,		0x115	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0x634,		0	},
+	},
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	0,		0	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x312	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x223	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x122,		0x41c	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
+struct power_pmu mpc7450_pmu = {
+	.name			= "MPC7450 family",
+	.n_counter		= N_COUNTER,
+	.max_alternatives	= MAX_ALT,
+	.add_fields		= 0x00111555ul,
+	.test_adder		= 0x00301000ul,
+	.compute_mmcr		= mpc7450_compute_mmcr,
+	.get_constraint		= mpc7450_get_constraint,
+	.get_alternatives	= mpc7450_get_alternatives,
+	.disable_pmc		= mpc7450_disable_pmc,
+	.n_generic		= ARRAY_SIZE(mpc7450_generic_events),
+	.generic_events		= mpc7450_generic_events,
+	.cache_events		= &mpc7450_cache_events,
+};
+
+static int init_mpc7450_pmu(void)
+{
+	if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/7450"))
+		return -ENODEV;
+
+	return register_power_pmu(&mpc7450_pmu);
+}
+
+arch_initcall(init_mpc7450_pmu);
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index dd9f3ec5ee30..8485c8ca7a06 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -75,6 +75,7 @@ config POWER4_ONLY
 config 6xx
 	def_bool y
 	depends on PPC32 && PPC_BOOK3S
+	select PPC_HAVE_PMU_SUPPORT
 
 config POWER3
 	bool

From e24a72c4d8f0b2c17783b3ba9c8931b537149423 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 17 Jun 2009 21:54:26 +1000
Subject: [PATCH 36/49] perf_counter: tools: Makefile tweaks for 64-bit powerpc

On 64-bit powerpc, perf needs to be built as a 64-bit executable.
This arranges to add the -m64 flag to CFLAGS if we are running on
a 64-bit machine, indicated by the result of uname -m ending in "64".
This means that we'll use -m64 on x86_64 machines as well.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linuxppc-dev@ozlabs.org
Cc: benh@kernel.crashing.org
LKML-Reference: <19000.55666.866148.559620@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index e8346f95fbb0..714db7327b94 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -157,9 +157,14 @@ uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
 uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
 uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
+# If we're on a 64-bit kernel, use -m64
+ifneq ($(patsubst %64,%,$(uname_M)),$(uname_M))
+  M64 := -m64
+endif
+
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
+CFLAGS = $(M64) -ggdb3 -Wall -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
 LDFLAGS = -lpthread -lrt -lelf -lm
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)

From d3a9262e59f7fb83c6d44df3b2b1460ed57d3ea1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 18 Jun 2009 12:54:00 +0200
Subject: [PATCH 37/49] fs: Provide empty .set_page_dirty() aop for anon inodes

.set_page_dirty() is one of those a_ops that defaults to the
buffer implementation when not set. Therefore provide a dummy
function to make it do nothing.

(Uncovered by perf counter fds, which can now be writably mmap-ed.)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/anon_inodes.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 1dd96d4406c0..47d4a01c5393 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -52,6 +52,19 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
 	.d_delete	= anon_inodefs_delete_dentry,
 };
 
+/*
+ * nop .set_page_dirty method so that people can use .page_mkwrite on
+ * anon inodes.
+ */
+static int anon_set_page_dirty(struct page *page)
+{
+	return 0;
+}
+
+static const struct address_space_operations anon_aops = {
+	.set_page_dirty = anon_set_page_dirty,
+};
+
 /**
  * anon_inode_getfd - creates a new file instance by hooking it up to an
  *                    anonymous inode, and a dentry that describe the "class"
@@ -151,6 +164,8 @@ static struct inode *anon_inode_mkinode(void)
 
 	inode->i_fop = &anon_inode_fops;
 
+	inode->i_mapping->a_ops = &anon_aops;
+
 	/*
 	 * Mark the inode dirty from the very beginning,
 	 * that way it will never be moved to the dirty

From 43a21ea81a2400992561146327c4785ce7f7be38 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 19:39:37 +0100
Subject: [PATCH 38/49] perf_counter: Add event overflow handling

An alternative method of mmap() data output handling that provides
better overflow management and a more reliable data stream.

Unlike the previous method, which had no user->kernel feedback and
relied on userspace keeping up, this method relies on userspace
writing its last read position into the control page.

It ensures that new output doesn't overwrite not-yet-read events;
new events for which there is no space left are lost, and the
overflow counter is incremented, providing exact event loss
numbers.
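
In user-space terms the protocol is roughly the following (a minimal
sketch, assuming the buffer was mapped PROT_READ|PROT_WRITE, using the
tool's md/mask naming, and ignoring records that wrap around the
buffer edge):

	struct perf_counter_mmap_page *pc = md->base;	/* control page */
	unsigned char *data = md->base + page_size;	/* data section  */
	unsigned long head, tail;

	head = pc->data_head;
	rmb();				/* read head before reading data */
	tail = pc->data_tail;

	while (tail != head) {
		struct perf_event_header *hdr = (void *)&data[tail & md->mask];

		/* ... consume the record at hdr ... */
		tail += hdr->size;	/* assumes the record doesn't wrap */
	}

	mb();				/* complete all reads before updating tail */
	pc->data_tail = tail;		/* kernel may now reuse space up to here */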

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  40 +++++---
 kernel/perf_counter.c        | 193 ++++++++++++++++++++++++-----------
 2 files changed, 162 insertions(+), 71 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a7d3a61a59b7..0765e8e69843 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -236,10 +236,16 @@ struct perf_counter_mmap_page {
 	/*
 	 * Control data for the mmap() data buffer.
 	 *
-	 * User-space reading this value should issue an rmb(), on SMP capable
-	 * platforms, after reading this value -- see perf_counter_wakeup().
+	 * User-space reading the @data_head value should issue an rmb(), on
+	 * SMP capable platforms, after reading this value -- see
+	 * perf_counter_wakeup().
+	 *
+	 * When the mapping is PROT_WRITE the @data_tail value should be
+	 * written by userspace to reflect the last read data. In this case
+	 * the kernel will not over-write unread data.
 	 */
 	__u64   data_head;		/* head in the data section */
+	__u64	data_tail;		/* user-space written tail */
 };
 
 #define PERF_EVENT_MISC_CPUMODE_MASK		(3 << 0)
@@ -273,6 +279,15 @@ enum perf_event_type {
 	 */
 	PERF_EVENT_MMAP			= 1,
 
+	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 * 	u64				id;
+	 * 	u64				lost;
+	 * };
+	 */
+	PERF_EVENT_LOST			= 2,
+
 	/*
 	 * struct {
 	 *	struct perf_event_header	header;
@@ -313,26 +328,26 @@ enum perf_event_type {
 
 	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
-	 * will be PERF_RECORD_*
+	 * will be PERF_SAMPLE_*
 	 *
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *
-	 *	{ u64			ip;	  } && PERF_RECORD_IP
-	 *	{ u32			pid, tid; } && PERF_RECORD_TID
-	 *	{ u64			time;     } && PERF_RECORD_TIME
-	 *	{ u64			addr;     } && PERF_RECORD_ADDR
-	 *	{ u64			config;   } && PERF_RECORD_CONFIG
-	 *	{ u32			cpu, res; } && PERF_RECORD_CPU
+	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
+	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
+	 *	{ u64			time;     } && PERF_SAMPLE_TIME
+	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
+	 *	{ u64			config;   } && PERF_SAMPLE_CONFIG
+	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
 	 *
 	 *	{ u64			nr;
-	 *	  { u64 id, val; }	cnt[nr];  } && PERF_RECORD_GROUP
+	 *	  { u64 id, val; }	cnt[nr];  } && PERF_SAMPLE_GROUP
 	 *
 	 *	{ u16			nr,
 	 *				hv,
 	 *				kernel,
 	 *				user;
-	 *	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
+	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
 	 * };
 	 */
 };
@@ -424,6 +439,7 @@ struct file;
 struct perf_mmap_data {
 	struct rcu_head			rcu_head;
 	int				nr_pages;	/* nr of data pages  */
+	int				writable;	/* are we writable   */
 	int				nr_locked;	/* nr pages mlocked  */
 
 	atomic_t			poll;		/* POLL_ for wakeups */
@@ -433,8 +449,8 @@ struct perf_mmap_data {
 	atomic_long_t			done_head;	/* completed head    */
 
 	atomic_t			lock;		/* concurrent writes */
-
 	atomic_t			wakeup;		/* needs a wakeup    */
+	atomic_t			lost;		/* nr records lost   */
 
 	struct perf_counter_mmap_page   *user_page;
 	void				*data_pages[0];
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 109a95723859..7e9108efd305 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1794,6 +1794,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct perf_mmap_data *data;
 	int ret = VM_FAULT_SIGBUS;
 
+	if (vmf->flags & FAULT_FLAG_MKWRITE) {
+		if (vmf->pgoff == 0)
+			ret = 0;
+		return ret;
+	}
+
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (!data)
@@ -1807,9 +1813,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		if ((unsigned)nr > data->nr_pages)
 			goto unlock;
 
+		if (vmf->flags & FAULT_FLAG_WRITE)
+			goto unlock;
+
 		vmf->page = virt_to_page(data->data_pages[nr]);
 	}
+
 	get_page(vmf->page);
+	vmf->page->mapping = vma->vm_file->f_mapping;
+	vmf->page->index   = vmf->pgoff;
+
 	ret = 0;
 unlock:
 	rcu_read_unlock();
@@ -1862,6 +1875,14 @@ fail:
 	return -ENOMEM;
 }
 
+static void perf_mmap_free_page(unsigned long addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	page->mapping = NULL;
+	__free_page(page);
+}
+
 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 {
 	struct perf_mmap_data *data;
@@ -1869,9 +1890,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 
 	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
 
-	free_page((unsigned long)data->user_page);
+	perf_mmap_free_page((unsigned long)data->user_page);
 	for (i = 0; i < data->nr_pages; i++)
-		free_page((unsigned long)data->data_pages[i]);
+		perf_mmap_free_page((unsigned long)data->data_pages[i]);
+
 	kfree(data);
 }
 
@@ -1908,9 +1930,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 }
 
 static struct vm_operations_struct perf_mmap_vmops = {
-	.open  = perf_mmap_open,
-	.close = perf_mmap_close,
-	.fault = perf_mmap_fault,
+	.open		= perf_mmap_open,
+	.close		= perf_mmap_close,
+	.fault		= perf_mmap_fault,
+	.page_mkwrite	= perf_mmap_fault,
 };
 
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1924,7 +1947,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	long user_extra, extra;
 	int ret = 0;
 
-	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
 	vma_size = vma->vm_end - vma->vm_start;
@@ -1983,10 +2006,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->locked_vm += extra;
 	counter->data->nr_locked = extra;
+	if (vma->vm_flags & VM_WRITE)
+		counter->data->writable = 1;
+
 unlock:
 	mutex_unlock(&counter->mmap_mutex);
 
-	vma->vm_flags &= ~VM_MAYWRITE;
 	vma->vm_flags |= VM_RESERVED;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -2163,11 +2188,38 @@ struct perf_output_handle {
 	unsigned long		head;
 	unsigned long		offset;
 	int			nmi;
-	int			overflow;
+	int			sample;
 	int			locked;
 	unsigned long		flags;
 };
 
+static bool perf_output_space(struct perf_mmap_data *data,
+			      unsigned int offset, unsigned int head)
+{
+	unsigned long tail;
+	unsigned long mask;
+
+	if (!data->writable)
+		return true;
+
+	mask = (data->nr_pages << PAGE_SHIFT) - 1;
+	/*
+	 * Userspace could choose to issue a mb() before updating the tail
+	 * pointer. So that all reads will be completed before the write is
+	 * issued.
+	 */
+	tail = ACCESS_ONCE(data->user_page->data_tail);
+	smp_rmb();
+
+	offset = (offset - tail) & mask;
+	head   = (head   - tail) & mask;
+
+	if ((int)(head - offset) < 0)
+		return false;
+
+	return true;
+}
+
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
 	atomic_set(&handle->data->poll, POLL_IN);
@@ -2258,55 +2310,6 @@ out:
 	local_irq_restore(handle->flags);
 }
 
-static int perf_output_begin(struct perf_output_handle *handle,
-			     struct perf_counter *counter, unsigned int size,
-			     int nmi, int overflow)
-{
-	struct perf_mmap_data *data;
-	unsigned int offset, head;
-
-	/*
-	 * For inherited counters we send all the output towards the parent.
-	 */
-	if (counter->parent)
-		counter = counter->parent;
-
-	rcu_read_lock();
-	data = rcu_dereference(counter->data);
-	if (!data)
-		goto out;
-
-	handle->data	 = data;
-	handle->counter	 = counter;
-	handle->nmi	 = nmi;
-	handle->overflow = overflow;
-
-	if (!data->nr_pages)
-		goto fail;
-
-	perf_output_lock(handle);
-
-	do {
-		offset = head = atomic_long_read(&data->head);
-		head += size;
-	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
-
-	handle->offset	= offset;
-	handle->head	= head;
-
-	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
-		atomic_set(&data->wakeup, 1);
-
-	return 0;
-
-fail:
-	perf_output_wakeup(handle);
-out:
-	rcu_read_unlock();
-
-	return -ENOSPC;
-}
-
 static void perf_output_copy(struct perf_output_handle *handle,
 			     const void *buf, unsigned int len)
 {
@@ -2346,6 +2349,78 @@ static void perf_output_copy(struct perf_output_handle *handle,
 #define perf_output_put(handle, x) \
 	perf_output_copy((handle), &(x), sizeof(x))
 
+static int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_counter *counter, unsigned int size,
+			     int nmi, int sample)
+{
+	struct perf_mmap_data *data;
+	unsigned int offset, head;
+	int have_lost;
+	struct {
+		struct perf_event_header header;
+		u64			 id;
+		u64			 lost;
+	} lost_event;
+
+	/*
+	 * For inherited counters we send all the output towards the parent.
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto out;
+
+	handle->data	= data;
+	handle->counter	= counter;
+	handle->nmi	= nmi;
+	handle->sample	= sample;
+
+	if (!data->nr_pages)
+		goto fail;
+
+	have_lost = atomic_read(&data->lost);
+	if (have_lost)
+		size += sizeof(lost_event);
+
+	perf_output_lock(handle);
+
+	do {
+		offset = head = atomic_long_read(&data->head);
+		head += size;
+		if (unlikely(!perf_output_space(data, offset, head)))
+			goto fail;
+	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+
+	handle->offset	= offset;
+	handle->head	= head;
+
+	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+		atomic_set(&data->wakeup, 1);
+
+	if (have_lost) {
+		lost_event.header.type = PERF_EVENT_LOST;
+		lost_event.header.misc = 0;
+		lost_event.header.size = sizeof(lost_event);
+		lost_event.id          = counter->id;
+		lost_event.lost        = atomic_xchg(&data->lost, 0);
+
+		perf_output_put(handle, lost_event);
+	}
+
+	return 0;
+
+fail:
+	atomic_inc(&data->lost);
+	perf_output_unlock(handle);
+out:
+	rcu_read_unlock();
+
+	return -ENOSPC;
+}
+
 static void perf_output_end(struct perf_output_handle *handle)
 {
 	struct perf_counter *counter = handle->counter;
@@ -2353,7 +2428,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 
 	int wakeup_events = counter->attr.wakeup_events;
 
-	if (handle->overflow && wakeup_events) {
+	if (handle->sample && wakeup_events) {
 		int events = atomic_inc_return(&data->events);
 		if (events >= wakeup_events) {
 			atomic_sub(wakeup_events, &data->events);
@@ -2958,7 +3033,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 }
 
 /*
- * Generic counter overflow handling.
+ * Generic counter overflow handling, sampling.
  */
 
 int perf_counter_overflow(struct perf_counter *counter, int nmi,

From 9d91a6f7a489eb914c16b82d927f9d81d629c259 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 18 Jun 2009 11:40:28 +0200
Subject: [PATCH 39/49] perf_counter tools: Handle lost events

Make use of the new ->data_tail mechanism to tell kernel-space
about user-space draining the data stream. Emit lost events
(and display them) if they happen.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 20 ++++++++++++++++----
 tools/perf/builtin-report.c | 29 ++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index e1dfef24887f..06fdfb8b4828 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -78,10 +78,10 @@ struct mmap_data {
 
 static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
-static unsigned int mmap_read_head(struct mmap_data *md)
+static unsigned long mmap_read_head(struct mmap_data *md)
 {
 	struct perf_counter_mmap_page *pc = md->base;
-	int head;
+	long head;
 
 	head = pc->data_head;
 	rmb();
@@ -89,6 +89,17 @@ static unsigned int mmap_read_head(struct mmap_data *md)
 	return head;
 }
 
+static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+
+	/*
+	 * ensure all reads are done before we write the tail out.
+	 */
+	/* mb(); */
+	pc->data_tail = tail;
+}
+
 static void mmap_read(struct mmap_data *md)
 {
 	unsigned int head = mmap_read_head(md);
@@ -109,7 +120,7 @@ static void mmap_read(struct mmap_data *md)
 	 * In either case, truncate and restart at head.
 	 */
 	diff = head - old;
-	if (diff > md->mask / 2 || diff < 0) {
+	if (diff < 0) {
 		struct timeval iv;
 		unsigned long msecs;
 
@@ -167,6 +178,7 @@ static void mmap_read(struct mmap_data *md)
 	}
 
 	md->prev = old;
+	mmap_write_tail(md, old);
 }
 
 static volatile int done = 0;
@@ -424,7 +436,7 @@ try_again:
 	mmap_array[nr_cpu][counter].prev = 0;
 	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
 	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-			PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
+			PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
 	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
 		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
 		exit(-1);
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 9a3805f0c9f2..fe66895111b1 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -83,6 +83,12 @@ struct period_event {
 	__u64 sample_period;
 };
 
+struct lost_event {
+	struct perf_event_header header;
+	__u64 id;
+	__u64 lost;
+};
+
 typedef union event_union {
 	struct perf_event_header	header;
 	struct ip_event			ip;
@@ -90,6 +96,7 @@ typedef union event_union {
 	struct comm_event		comm;
 	struct fork_event		fork;
 	struct period_event		period;
+	struct lost_event		lost;
 } event_t;
 
 static LIST_HEAD(dsos);
@@ -1068,7 +1075,8 @@ static unsigned long total = 0,
 		     total_mmap = 0,
 		     total_comm = 0,
 		     total_fork = 0,
-		     total_unknown = 0;
+		     total_unknown = 0,
+		     total_lost = 0;
 
 static int validate_chain(struct perf_callchain_entry *chain, event_t *event)
 {
@@ -1260,6 +1268,20 @@ process_period_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static int
+process_lost_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	dprintf("%p [%p]: PERF_EVENT_LOST: id:%Ld: lost:%Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->lost.id,
+		event->lost.lost);
+
+	total_lost += event->lost.lost;
+
+	return 0;
+}
+
 static void trace_event(event_t *event)
 {
 	unsigned char *raw_event = (void *)event;
@@ -1316,6 +1338,10 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 
 	case PERF_EVENT_PERIOD:
 		return process_period_event(event, offset, head);
+
+	case PERF_EVENT_LOST:
+		return process_lost_event(event, offset, head);
+
 	/*
 	 * We dont process them right now but they are fine:
 	 */
@@ -1444,6 +1470,7 @@ more:
 	dprintf("    mmap events: %10ld\n", total_mmap);
 	dprintf("    comm events: %10ld\n", total_comm);
 	dprintf("    fork events: %10ld\n", total_fork);
+	dprintf("    lost events: %10ld\n", total_lost);
 	dprintf(" unknown events: %10ld\n", total_unknown);
 
 	if (dump_trace)

From b8e6d829729d1a5991a9f628205b671cac2ec06f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 18 Jun 2009 14:32:19 +0200
Subject: [PATCH 40/49] perf report: Filter to parent set by default

Make it easier to use parent filtering - default to a filtered
output. Also add the parent column so that we get collapsing, but
don't display it by default.

Add --no-exclude-other to override this.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Makefile         |  2 +-
 tools/perf/builtin-report.c | 30 +++++++++++++++++++++++++++---
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 714db7327b94..672c5f069c6e 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -164,7 +164,7 @@ endif
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = $(M64) -ggdb3 -Wall -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
+CFLAGS = $(M64) -ggdb3 -Wall -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -Werror -O6
 LDFLAGS = -lpthread -lrt -lelf -lm
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index fe66895111b1..86981bd08f65 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -46,9 +46,12 @@ static int		full_paths;
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
-static char		*parent_pattern = "^sys_|^do_page_fault";
+static char		default_parent_pattern[] = "^sys_|^do_page_fault";
+static char		*parent_pattern = default_parent_pattern;
 static regex_t		parent_regex;
 
+static int		exclude_other = 1;
+
 struct ip_event {
 	struct perf_event_header header;
 	__u64 ip;
@@ -742,6 +745,9 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples)
 	struct sort_entry *se;
 	size_t ret;
 
+	if (exclude_other && !self->parent)
+		return 0;
+
 	if (total_samples) {
 		double percent = self->count * 100.0 / total_samples;
 		char *color = PERF_COLOR_NORMAL;
@@ -764,6 +770,9 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples)
 		ret = fprintf(fp, "%12Ld ", self->count);
 
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		if (exclude_other && (se == &sort_parent))
+			continue;
+
 		fprintf(fp, "  ");
 		ret += se->print(fp, self);
 	}
@@ -855,6 +864,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		.ip	= ip,
 		.level	= level,
 		.count	= count,
+		.parent = NULL,
 	};
 	int cmp;
 
@@ -1029,14 +1039,20 @@ static size_t output__fprintf(FILE *fp, __u64 total_samples)
 	fprintf(fp, "#\n");
 
 	fprintf(fp, "# Overhead");
-	list_for_each_entry(se, &hist_entry__sort_list, list)
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		if (exclude_other && (se == &sort_parent))
+			continue;
 		fprintf(fp, "  %s", se->header);
+	}
 	fprintf(fp, "\n");
 
 	fprintf(fp, "# ........");
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		int i;
 
+		if (exclude_other && (se == &sort_parent))
+			continue;
+
 		fprintf(fp, "  ");
 		for (i = 0; i < strlen(se->header); i++)
 			fprintf(fp, ".");
@@ -1050,7 +1066,8 @@ static size_t output__fprintf(FILE *fp, __u64 total_samples)
 		ret += hist_entry__fprintf(fp, pos, total_samples);
 	}
 
-	if (!strcmp(sort_order, default_sort_order)) {
+	if (sort_order == default_sort_order &&
+			parent_pattern == default_parent_pattern) {
 		fprintf(fp, "#\n");
 		fprintf(fp, "# (For more details, try: perf report --sort comm,dso,symbol)\n");
 		fprintf(fp, "#\n");
@@ -1508,6 +1525,8 @@ static const struct option options[] = {
 		    "Don't shorten the pathnames taking into account the cwd"),
 	OPT_STRING('p', "parent", &parent_pattern, "regex",
 		   "regex filter to identify parent, see: '--sort parent'"),
+	OPT_BOOLEAN('x', "exclude-other", &exclude_other,
+		    "Only display entries with parent-match"),
 	OPT_END()
 };
 
@@ -1536,6 +1555,11 @@ int cmd_report(int argc, const char **argv, const char *prefix)
 
 	setup_sorting();
 
+	if (parent_pattern != default_parent_pattern)
+		sort_dimension__add("parent");
+	else
+		exclude_other = 0;
+
 	/*
 	 * Any (unrecognized) arguments left?
 	 */

From f9188e023c248d73f5b4a589b480e065c1864068 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 18 Jun 2009 22:20:52 +0200
Subject: [PATCH 41/49] perf_counter: Make callchain samples extensible

Before exposing upstream tools to a callchain-samples ABI, tidy it
up to make it more extensible in the future:

Use markers in the IP chain to denote context, and use the
(u64)-1..-4095 range for these context markers because that range is
already reserved for ERR_PTR() values, so these addresses are unlikely
to be mapped.
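
Walking such a chain on the consumer side then looks roughly like this
(a sketch; the same loop shape appears in the tooling update later in
this series):

	__u64 context = PERF_CONTEXT_MAX;
	int i;

	for (i = 0; i < chain->nr; i++) {
		__u64 ip = chain->ips[i];

		if (ip >= PERF_CONTEXT_MAX) {
			context = ip;	/* a marker, not an address */
			continue;
		}
		/* resolve ip as a kernel or user address based on context */
	}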

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 29 ++++++-----------------------
 include/linux/perf_counter.h       | 28 +++++++++++++++++-----------
 2 files changed, 23 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ce1ae3f1f86c..76dfef23f789 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1555,9 +1555,9 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
  */
 
 static inline
-void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
+void callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < MAX_STACK_DEPTH)
+	if (entry->nr < PERF_MAX_STACK_DEPTH)
 		entry->ip[entry->nr++] = ip;
 }
 
@@ -1602,22 +1602,10 @@ static const struct stacktrace_ops backtrace_ops = {
 static void
 perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
-	unsigned long bp;
-	char *stack;
-	int nr = entry->nr;
-
+	callchain_store(entry, PERF_CONTEXT_KERNEL);
 	callchain_store(entry, regs->ip);
 
-	stack = ((char *)regs + sizeof(struct pt_regs));
-#ifdef CONFIG_FRAME_POINTER
-	get_bp(bp);
-#else
-	bp = 0;
-#endif
-
-	dump_trace(NULL, regs, (void *)&stack, bp, &backtrace_ops, entry);
-
-	entry->kernel = entry->nr - nr;
+	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
 }
 
 /*
@@ -1669,16 +1657,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	struct stack_frame frame;
 	const void __user *fp;
-	int nr = entry->nr;
 
 	if (!user_mode(regs))
 		regs = task_pt_regs(current);
 
 	fp = (void __user *)regs->bp;
 
+	callchain_store(entry, PERF_CONTEXT_USER);
 	callchain_store(entry, regs->ip);
 
-	while (entry->nr < MAX_STACK_DEPTH) {
+	while (entry->nr < PERF_MAX_STACK_DEPTH) {
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
 
@@ -1691,8 +1679,6 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		callchain_store(entry, frame.return_address);
 		fp = frame.next_frame;
 	}
-
-	entry->user = entry->nr - nr;
 }
 
 static void
@@ -1728,9 +1714,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 		entry = &__get_cpu_var(irq_entry);
 
 	entry->nr = 0;
-	entry->hv = 0;
-	entry->kernel = 0;
-	entry->user = 0;
 
 	perf_do_callchain(regs, entry);
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0765e8e69843..e7e7e0242767 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -343,23 +343,22 @@ enum perf_event_type {
 	 *	{ u64			nr;
 	 *	  { u64 id, val; }	cnt[nr];  } && PERF_SAMPLE_GROUP
 	 *
-	 *	{ u16			nr,
-	 *				hv,
-	 *				kernel,
-	 *				user;
+	 *	{ u64			nr,
 	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
 	 * };
 	 */
 };
 
-#define MAX_STACK_DEPTH			255
+enum perf_callchain_context {
+	PERF_CONTEXT_HV			= (__u64)-32,
+	PERF_CONTEXT_KERNEL		= (__u64)-128,
+	PERF_CONTEXT_USER		= (__u64)-512,
 
-struct perf_callchain_entry {
-	__u16				nr;
-	__u16				hv;
-	__u16				kernel;
-	__u16				user;
-	__u64				ip[MAX_STACK_DEPTH];
+	PERF_CONTEXT_GUEST		= (__u64)-2048,
+	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
+	PERF_CONTEXT_GUEST_USER		= (__u64)-2560,
+
+	PERF_CONTEXT_MAX		= (__u64)-4095,
 };
 
 #ifdef __KERNEL__
@@ -381,6 +380,13 @@ struct perf_callchain_entry {
 #include <linux/pid_namespace.h>
 #include <asm/atomic.h>
 
+#define PERF_MAX_STACK_DEPTH		255
+
+struct perf_callchain_entry {
+	__u64				nr;
+	__u64				ip[PERF_MAX_STACK_DEPTH];
+};
+
 struct task_struct;
 
 /**

From 2a0a50fe9def21835d65035cc8109c0b6dd6099d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 18 Jun 2009 22:20:45 +0200
Subject: [PATCH 42/49] perf_counter: Update userspace callchain sampling uses

Update the tools to reflect the new callchain sampling format.

LKML-Reference: <new-submission>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 82 +++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 45 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 86981bd08f65..7a6577bf9a41 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -59,6 +59,11 @@ struct ip_event {
 	unsigned char __more_data[];
 };
 
+struct ip_callchain {
+	__u64 nr;
+	__u64 ips[0];
+};
+
 struct mmap_event {
 	struct perf_event_header header;
 	__u32 pid, tid;
@@ -833,15 +838,12 @@ got_dso:
 	return dso->find_symbol(dso, ip);
 }
 
-static struct symbol *call__match(struct symbol *sym)
+static int call__match(struct symbol *sym)
 {
-	if (!sym)
-		return NULL;
-
 	if (sym->name && !regexec(&parent_regex, sym->name, 0, NULL, 0))
-		return sym;
+		return 1;
 
-	return NULL;
+	return 0;
 }
 
 /*
@@ -850,7 +852,7 @@ static struct symbol *call__match(struct symbol *sym)
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, __u64 ip, struct perf_callchain_entry *chain,
+		struct symbol *sym, __u64 ip, struct ip_callchain *chain,
 		char level, __u64 count)
 {
 	struct rb_node **p = &hist.rb_node;
@@ -869,31 +871,35 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 	int cmp;
 
 	if (sort__has_parent && chain) {
-		int i, nr = chain->hv;
-		struct symbol *sym;
-		struct dso *dso;
-		__u64 ip;
+		__u64 context = PERF_CONTEXT_MAX;
+		int i;
+
+		for (i = 0; i < chain->nr; i++) {
+			__u64 ip = chain->ips[i];
+			struct dso *dso = NULL;
+			struct symbol *sym;
+
+			if (ip >= PERF_CONTEXT_MAX) {
+				context = ip;
+				continue;
+			}
+
+			switch (context) {
+			case PERF_CONTEXT_KERNEL:
+				dso = kernel_dso;
+				break;
+			default:
+				break;
+			}
 
-		for (i = 0; i < chain->kernel; i++) {
-			ip = chain->ip[nr + i];
-			dso = kernel_dso;
 			sym = resolve_symbol(thread, NULL, &dso, &ip);
-			entry.parent = call__match(sym);
-			if (entry.parent)
-				goto got_parent;
-		}
-		nr += i;
 
-		for (i = 0; i < chain->user; i++) {
-			ip = chain->ip[nr + i];
-			sym = resolve_symbol(thread, NULL, NULL, &ip);
-			entry.parent = call__match(sym);
-			if (entry.parent)
-				goto got_parent;
+			if (sym && call__match(sym)) {
+				entry.parent = sym;
+				break;
+			}
 		}
-		nr += i;
 	}
-got_parent:
 
 	while (*p != NULL) {
 		parent = *p;
@@ -1095,21 +1101,10 @@ static unsigned long total = 0,
 		     total_unknown = 0,
 		     total_lost = 0;
 
-static int validate_chain(struct perf_callchain_entry *chain, event_t *event)
+static int validate_chain(struct ip_callchain *chain, event_t *event)
 {
 	unsigned int chain_size;
 
-	if (chain->nr > MAX_STACK_DEPTH)
-		return -1;
-	if (chain->hv > MAX_STACK_DEPTH)
-		return -1;
-	if (chain->kernel > MAX_STACK_DEPTH)
-		return -1;
-	if (chain->user > MAX_STACK_DEPTH)
-		return -1;
-	if (chain->hv + chain->kernel + chain->user != chain->nr)
-		return -1;
-
 	chain_size = event->header.size;
 	chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event;
 
@@ -1130,7 +1125,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	__u64 period = 1;
 	struct map *map = NULL;
 	void *more_data = event->ip.__more_data;
-	struct perf_callchain_entry *chain = NULL;
+	struct ip_callchain *chain = NULL;
 
 	if (event->header.type & PERF_SAMPLE_PERIOD) {
 		period = *(__u64 *)more_data;
@@ -1150,10 +1145,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 
 		chain = (void *)more_data;
 
-		dprintf("... chain: u:%d, k:%d, nr:%d\n",
-			chain->user,
-			chain->kernel,
-			chain->nr);
+		dprintf("... chain: nr:%Lu\n", chain->nr);
 
 		if (validate_chain(chain, event) < 0) {
 			eprintf("call-chain problem with event, skipping it.\n");
@@ -1162,7 +1154,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 
 		if (dump_trace) {
 			for (i = 0; i < chain->nr; i++)
-				dprintf("..... %2d: %016Lx\n", i, chain->ip[i]);
+				dprintf("..... %2d: %016Lx\n", i, chain->ips[i]);
 		}
 	}
 

From f5970550d5ccf90453cbd7d260370ea99d1f6513 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 18 Jun 2009 23:22:55 +0200
Subject: [PATCH 43/49] perf_counter tools: Add a data file header

Add a data file header so we can transfer data between record and report.
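
For reference, the resulting on-disk shape, sketched as a comment (our
illustration): report now starts reading at sizeof(file_header) and stops
after data_size bytes rather than at EOF, which is what keeps -A appended
files consistent:

  /*
   * perf.data after this patch:
   *
   *   +---------------------------+  offset 0
   *   | struct perf_file_header   |  version, sample_type, data_size
   *   +---------------------------+  sizeof(file_header)
   *   | event records ...         |  data_size bytes
   *   +---------------------------+
   */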

LKML-Reference: <new-submission>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 94 ++++++++++++++++++++-----------------
 tools/perf/builtin-report.c | 16 ++++++-
 tools/perf/perf.h           |  6 +++
 3 files changed, 73 insertions(+), 43 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 06fdfb8b4828..28304677c73e 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -51,6 +51,9 @@ static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];
 static int			nr_poll;
 static int			nr_cpu;
 
+static int			file_new = 1;
+static struct perf_file_header	file_header;
+
 struct mmap_event {
 	struct perf_event_header	header;
 	__u32				pid;
@@ -100,6 +103,21 @@ static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
 	pc->data_tail = tail;
 }
 
+static void write_output(void *buf, size_t size)
+{
+	while (size) {
+		int ret = write(output, buf, size);
+
+		if (ret < 0)
+			die("failed to write");
+
+		size -= ret;
+		buf += ret;
+
+		bytes_written += ret;
+	}
+}
+
 static void mmap_read(struct mmap_data *md)
 {
 	unsigned int head = mmap_read_head(md);
@@ -148,34 +166,14 @@ static void mmap_read(struct mmap_data *md)
 		size = md->mask + 1 - (old & md->mask);
 		old += size;
 
-		while (size) {
-			int ret = write(output, buf, size);
-
-			if (ret < 0)
-				die("failed to write");
-
-			size -= ret;
-			buf += ret;
-
-			bytes_written += ret;
-		}
+		write_output(buf, size);
 	}
 
 	buf = &data[old & md->mask];
 	size = head - old;
 	old += size;
 
-	while (size) {
-		int ret = write(output, buf, size);
-
-		if (ret < 0)
-			die("failed to write");
-
-		size -= ret;
-		buf += ret;
-
-		bytes_written += ret;
-	}
+	write_output(buf, size);
 
 	md->prev = old;
 	mmap_write_tail(md, old);
@@ -204,7 +202,7 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 	struct comm_event comm_ev;
 	char filename[PATH_MAX];
 	char bf[BUFSIZ];
-	int fd, ret;
+	int fd;
 	size_t size;
 	char *field, *sep;
 	DIR *tasks;
@@ -246,11 +244,7 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 	if (!full) {
 		comm_ev.tid = pid;
 
-		ret = write(output, &comm_ev, comm_ev.header.size);
-		if (ret < 0) {
-			perror("failed to write");
-			exit(-1);
-		}
+		write_output(&comm_ev, comm_ev.header.size);
 		return;
 	}
 
@@ -265,11 +259,7 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 
 		comm_ev.tid = pid;
 
-		ret = write(output, &comm_ev, comm_ev.header.size);
-		if (ret < 0) {
-			perror("failed to write");
-			exit(-1);
-		}
+		write_output(&comm_ev, comm_ev.header.size);
 	}
 	closedir(tasks);
 	return;
@@ -332,10 +322,7 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 			mmap_ev.pid = pid;
 			mmap_ev.tid = pid;
 
-			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
-				perror("failed to write");
-				exit(-1);
-			}
+			write_output(&mmap_ev, mmap_ev.header.size);
 		}
 	}
 
@@ -382,6 +369,15 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
 
+	if (file_new) {
+		file_header.sample_type = attr->sample_type;
+	} else {
+		if (file_header.sample_type != attr->sample_type) {
+			fprintf(stderr, "incompatible append\n");
+			exit(-1);
+		}
+	}
+
 	attr->mmap		= track;
 	attr->comm		= track;
 	attr->inherit		= (cpu < 0) && inherit;
@@ -461,6 +457,13 @@ static void open_counters(int cpu, pid_t pid)
 	nr_cpu++;
 }
 
+static void atexit_header(void)
+{
+	file_header.data_size += bytes_written;
+
+	pwrite(output, &file_header, sizeof(file_header), 0);
+}
+
 static int __cmd_record(int argc, const char **argv)
 {
 	int i, counter;
@@ -474,6 +477,10 @@ static int __cmd_record(int argc, const char **argv)
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
+	atexit(sig_atexit);
+	signal(SIGCHLD, sig_handler);
+	signal(SIGINT, sig_handler);
+
 	if (!stat(output_name, &st) && !force && !append_file) {
 		fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
 				output_name);
@@ -482,7 +489,7 @@ static int __cmd_record(int argc, const char **argv)
 
 	flags = O_CREAT|O_RDWR;
 	if (append_file)
-		flags |= O_APPEND;
+		file_new = 0;
 	else
 		flags |= O_TRUNC;
 
@@ -492,15 +499,18 @@ static int __cmd_record(int argc, const char **argv)
 		exit(-1);
 	}
 
+	if (!file_new) {
+		read(output, &file_header, sizeof(file_header));
+		lseek(output, file_header.data_size, SEEK_CUR);
+	}
+
+	atexit(atexit_header);
+
 	if (!system_wide) {
 		open_counters(-1, target_pid != -1 ? target_pid : getpid());
 	} else for (i = 0; i < nr_cpus; i++)
 		open_counters(i, target_pid);
 
-	atexit(sig_atexit);
-	signal(SIGCHLD, sig_handler);
-	signal(SIGINT, sig_handler);
-
 	if (target_pid == -1 && argc) {
 		pid = fork();
 		if (pid < 0)
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 7a6577bf9a41..37b26ecb0d0b 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1366,11 +1366,13 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static struct perf_file_header		file_header;
+
 static int __cmd_report(void)
 {
 	int ret, rc = EXIT_FAILURE;
 	unsigned long offset = 0;
-	unsigned long head = 0;
+	unsigned long head = sizeof(file_header);
 	struct stat stat;
 	event_t *event;
 	uint32_t size;
@@ -1398,6 +1400,14 @@ static int __cmd_report(void)
 		exit(0);
 	}
 
+	read(input, &file_header, sizeof(file_header));
+
+	if (sort__has_parent &&
+	    !(file_header.sample_type & PERF_SAMPLE_CALLCHAIN)) {
+		fprintf(stderr, "selected --sort parent, but no callchain data\n");
+		exit(-1);
+	}
+
 	if (load_kernel() < 0) {
 		perror("failed to load kernel symbols");
 		return EXIT_FAILURE;
@@ -1469,9 +1479,13 @@ more:
 
 	head += size;
 
+	if (offset + head >= sizeof(file_header) + file_header.data_size)
+		goto done;
+
 	if (offset + head < stat.st_size)
 		goto more;
 
+done:
 	rc = EXIT_SUCCESS;
 	close(input);
 
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 87a1aca4a424..55c62f4b990b 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -65,4 +65,10 @@ sys_perf_counter_open(struct perf_counter_attr *attr,
 #define MAX_COUNTERS			256
 #define MAX_NR_CPUS			256
 
+struct perf_file_header {
+	__u64	version;
+	__u64	sample_type;
+	__u64	data_size;
+};
+
 #endif

From e5289d4a181fb6c0b7a7607649af2ffdc491335c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 19 Jun 2009 13:22:51 +0200
Subject: [PATCH 44/49] perf_counter: Simplify and fix task migration counting

The task migrations counter was causing rare and hard to decipher
memory corruptions under load. After a day of debugging and bisection
we found that the problem was introduced with:

  3f731ca: perf_counter: Fix cpu migration counter

Turning it off fixes the crashes. Incidentally, the whole
perf_counter_task_migration() logic can also be done more simply,
by injecting a proper sw-counter event.

This cleanup also fixed the crashes. The precise failure mode is
not completely clear yet, but we are clearly not unhappy about
having a fix ;-)
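
In sketch form (kernel context, for illustration only), the
scheduler-side call collapses as follows:

  	/* Before: dedicated helper, pinning and walking contexts. */
  	perf_counter_task_migration(p, new_cpu);

  	/* After: inject a plain software-counter event in place. */
  	perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);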

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  4 ----
 kernel/perf_counter.c        | 23 +----------------------
 kernel/sched.c               |  3 ++-
 3 files changed, 3 insertions(+), 27 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e7e7e0242767..89698d8aba5c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -682,8 +682,6 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma)
 extern void perf_counter_comm(struct task_struct *tsk);
 extern void perf_counter_fork(struct task_struct *tsk);
 
-extern void perf_counter_task_migration(struct task_struct *task, int cpu);
-
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_paranoid;
@@ -724,8 +722,6 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma)	{ }
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
-static inline void perf_counter_task_migration(struct task_struct *task,
-					       int cpu)			{ }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7e9108efd305..8d4f0dd41c22 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -124,7 +124,7 @@ void perf_enable(void)
 
 static void get_ctx(struct perf_counter_context *ctx)
 {
-	atomic_inc(&ctx->refcount);
+	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 }
 
 static void free_ctx(struct rcu_head *head)
@@ -3467,27 +3467,6 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_counter_read,
 };
 
-/*
- * Software counter: cpu migrations
- */
-void perf_counter_task_migration(struct task_struct *task, int cpu)
-{
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx;
-
-	perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
-				 PERF_COUNT_SW_CPU_MIGRATIONS,
-				 1, 1, NULL, 0);
-
-	ctx = perf_pin_task_context(task);
-	if (ctx) {
-		perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
-					 PERF_COUNT_SW_CPU_MIGRATIONS,
-					 1, 1, NULL, 0);
-		perf_unpin_context(ctx);
-	}
-}
-
 #ifdef CONFIG_EVENT_PROFILE
 void perf_tpcounter_event(int event_id)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index 8fb88a906aaa..f46540b359c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1978,7 +1978,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
 #endif
-		perf_counter_task_migration(p, new_cpu);
+		perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+				     1, 1, NULL, 0);
 	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;

From 0c87197142427063e096f11603543ca874045952 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 15 Jun 2009 11:35:01 +0200
Subject: [PATCH 45/49] perf_counter, x86: Improve interactions with fast-gup

Improve a few details in perfcounter call-chain recording that
makes use of fast-GUP:

- Use ACCESS_ONCE() to observe the pte value. PTEs are fundamentally
  racy and can be changed on another CPU, so we have to be careful
  about how we access them. The PAE branch is already careful with
  read-barriers, but the non-PAE and 64-bit side needs an
  ACCESS_ONCE() to make sure the pte value is observed only once.

- Make the checks a bit stricter so that we can feed it any kind of
  cra^H^H^H user-space input ;-)
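
To illustrate the first point, a small self-contained sketch (our
example; the macro is the kernel's own definition from
include/linux/compiler.h):

  #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

  unsigned long shared_pte;	/* imagine another CPU rewriting this */

  unsigned long snapshot_pte(void)
  {
  	/*
  	 * The volatile cast forces exactly one load; a plain read
  	 * could legally be re-fetched by the optimizer, so two uses
  	 * of the "same" value might observe two different ptes.
  	 */
  	return ACCESS_ONCE(shared_pte);
  }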

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uaccess.h | 7 ++++++-
 arch/x86/mm/gup.c              | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index b685ece89d5c..512ee87062c2 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -25,7 +25,12 @@
 #define MAKE_MM_SEG(s)	((mm_segment_t) { (s) })
 
 #define KERNEL_DS	MAKE_MM_SEG(-1UL)
-#define USER_DS		MAKE_MM_SEG(PAGE_OFFSET)
+
+#ifdef CONFIG_X86_32
+# define USER_DS	MAKE_MM_SEG(PAGE_OFFSET)
+#else
+# define USER_DS	MAKE_MM_SEG(__VIRTUAL_MASK)
+#endif
 
 #define get_ds()	(KERNEL_DS)
 #define get_fs()	(current_thread_info()->addr_limit)
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 697d5727c119..2d1d784ad3f7 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -14,7 +14,7 @@
 static inline pte_t gup_get_pte(pte_t *ptep)
 {
 #ifndef CONFIG_X86_PAE
-	return *ptep;
+	return ACCESS_ONCE(*ptep);
 #else
 	/*
 	 * With get_user_pages_fast, we walk down the pagetables without taking

From b49a9e7e72103ea91946453c19703a4dfa1994fe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 19 Jun 2009 17:39:33 +0200
Subject: [PATCH 46/49] perf_counter: Close race in perf_lock_task_context()

perf_lock_task_context() is buggy because it can return a dead
context.

The RCU read lock in perf_lock_task_context() only guarantees
that the memory won't get freed; it doesn't guarantee the object
is valid (in our case refcount > 0).

Therefore we can return a locked object that can get freed the
moment we release the RCU read lock.

perf_pin_task_context() then increases the refcount and does an
unlock on freed memory.

If the refcount started out at zero, that increase will cause a
double free.

Amend this by folding the get_ctx() functionality into
perf_lock_task_context() (all users already did this afterwards
anyway), and by returning a NULL context when the found one is
already dead.
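
Distilled, the lookup pattern now looks like this (kernel-context
sketch; the locking details are omitted):

  	rcu_read_lock();
  	ctx = rcu_dereference(task->perf_counter_ctxp);
  	if (ctx && !atomic_inc_not_zero(&ctx->refcount)) {
  		/* RCU kept the memory, but the object already died. */
  		ctx = NULL;
  	}
  	rcu_read_unlock();
  	/* ctx is now either NULL or owned via a live reference. */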

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8d4f0dd41c22..adb6ae506d5b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -175,6 +175,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 			spin_unlock_irqrestore(&ctx->lock, *flags);
 			goto retry;
 		}
+
+		if (!atomic_inc_not_zero(&ctx->refcount)) {
+			spin_unlock_irqrestore(&ctx->lock, *flags);
+			ctx = NULL;
+		}
 	}
 	rcu_read_unlock();
 	return ctx;
@@ -193,7 +198,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta
 	ctx = perf_lock_task_context(task, &flags);
 	if (ctx) {
 		++ctx->pin_count;
-		get_ctx(ctx);
 		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 	return ctx;
@@ -1459,11 +1463,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 			put_ctx(parent_ctx);
 			ctx->parent_ctx = NULL;		/* no longer a clone */
 		}
-		/*
-		 * Get an extra reference before dropping the lock so that
-		 * this context won't get freed if the task exits.
-		 */
-		get_ctx(ctx);
 		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 

From 9cffa8d53335d891cc0ecb3824a67118b3ee4b2f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 19 Jun 2009 22:21:42 +1000
Subject: [PATCH 47/49] perf_counter tools: Define and use our own u64, s64
 etc. definitions

On 64-bit powerpc, __u64 is defined to be unsigned long rather than
unsigned long long.  This causes compiler warnings every time we
print a __u64 value with %Lx.

Rather than changing __u64, we define our own u64 to be unsigned long
long on all architectures, and similarly s64 as signed long long.
For consistency we also define u32, s32, u16, s16, u8 and s8.  These
definitions are put in a new header, types.h, because these definitions
are needed in util/string.h and util/symbol.h.

The main change here is the mechanical change of __[us]{64,32,16,8}
to remove the "__".  The other changes are:

* Create types.h
* Include types.h in perf.h, util/string.h and util/symbol.h
* Add types.h to the LIB_H definition in Makefile
* Add (u64) casts in process_overflow_event() and print_sym_table()
  to kill two remaining warnings.
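
A minimal example of the warning class being fixed (ours, not from the
patch): with the new typedef the format string matches on every
architecture, including 64-bit powerpc:

  #include <stdio.h>

  typedef unsigned long long u64;	/* as in the new types.h */

  int main(void)
  {
  	u64 ip = 0xc000000000123456ULL;

  	/* %Lx expects unsigned long long, so this is clean everywhere. */
  	printf("%016Lx\n", ip);
  	return 0;
  }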

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: benh@kernel.crashing.org
LKML-Reference: <19003.33494.495844.956580@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Makefile            |  1 +
 tools/perf/builtin-annotate.c  | 66 +++++++++++++-------------
 tools/perf/builtin-record.c    | 20 ++++----
 tools/perf/builtin-report.c    | 84 +++++++++++++++++-----------------
 tools/perf/builtin-stat.c      | 62 ++++++++++++-------------
 tools/perf/builtin-top.c       | 24 +++++-----
 tools/perf/perf.h              |  7 +--
 tools/perf/types.h             | 17 +++++++
 tools/perf/util/parse-events.c | 10 ++--
 tools/perf/util/string.c       |  2 +-
 tools/perf/util/string.h       |  4 +-
 tools/perf/util/symbol.c       | 20 ++++----
 tools/perf/util/symbol.h       | 15 +++---
 13 files changed, 176 insertions(+), 156 deletions(-)
 create mode 100644 tools/perf/types.h

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 672c5f069c6e..36d7eef49913 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -290,6 +290,7 @@ LIB_FILE=libperf.a
 
 LIB_H += ../../include/linux/perf_counter.h
 LIB_H += perf.h
+LIB_H += types.h
 LIB_H += util/list.h
 LIB_H += util/rbtree.h
 LIB_H += util/levenshtein.h
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 94cea678fd7e..7e58e3ad1508 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -50,35 +50,35 @@ static unsigned long	mmap_window = 32;
 
 struct ip_event {
 	struct perf_event_header header;
-	__u64 ip;
-	__u32 pid, tid;
+	u64 ip;
+	u32 pid, tid;
 };
 
 struct mmap_event {
 	struct perf_event_header header;
-	__u32 pid, tid;
-	__u64 start;
-	__u64 len;
-	__u64 pgoff;
+	u32 pid, tid;
+	u64 start;
+	u64 len;
+	u64 pgoff;
 	char filename[PATH_MAX];
 };
 
 struct comm_event {
 	struct perf_event_header header;
-	__u32 pid, tid;
+	u32 pid, tid;
 	char comm[16];
 };
 
 struct fork_event {
 	struct perf_event_header header;
-	__u32 pid, ppid;
+	u32 pid, ppid;
 };
 
 struct period_event {
 	struct perf_event_header header;
-	__u64 time;
-	__u64 id;
-	__u64 sample_period;
+	u64 time;
+	u64 id;
+	u64 sample_period;
 };
 
 typedef union event_union {
@@ -158,7 +158,7 @@ static void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
-static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
+static struct symbol *vdso__find_symbol(struct dso *dso, u64 ip)
 {
 	return dso__find_symbol(kernel_dso, ip);
 }
@@ -191,19 +191,19 @@ static int load_kernel(void)
 
 struct map {
 	struct list_head node;
-	__u64	 start;
-	__u64	 end;
-	__u64	 pgoff;
-	__u64	 (*map_ip)(struct map *, __u64);
+	u64	 start;
+	u64	 end;
+	u64	 pgoff;
+	u64	 (*map_ip)(struct map *, u64);
 	struct dso	 *dso;
 };
 
-static __u64 map__map_ip(struct map *map, __u64 ip)
+static u64 map__map_ip(struct map *map, u64 ip)
 {
 	return ip - map->start + map->pgoff;
 }
 
-static __u64 vdso__map_ip(struct map *map, __u64 ip)
+static u64 vdso__map_ip(struct map *map, u64 ip)
 {
 	return ip;
 }
@@ -386,7 +386,7 @@ static int thread__fork(struct thread *self, struct thread *parent)
 	return 0;
 }
 
-static struct map *thread__find_map(struct thread *self, __u64 ip)
+static struct map *thread__find_map(struct thread *self, u64 ip)
 {
 	struct map *pos;
 
@@ -427,7 +427,7 @@ struct hist_entry {
 	struct map	 *map;
 	struct dso	 *dso;
 	struct symbol	 *sym;
-	__u64	 ip;
+	u64	 ip;
 	char		 level;
 
 	uint32_t	 count;
@@ -532,7 +532,7 @@ sort__dso_print(FILE *fp, struct hist_entry *self)
 	if (self->dso)
 		return fprintf(fp, "%-25s", self->dso->name);
 
-	return fprintf(fp, "%016llx         ", (__u64)self->ip);
+	return fprintf(fp, "%016llx         ", (u64)self->ip);
 }
 
 static struct sort_entry sort_dso = {
@@ -546,7 +546,7 @@ static struct sort_entry sort_dso = {
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	__u64 ip_l, ip_r;
+	u64 ip_l, ip_r;
 
 	if (left->sym == right->sym)
 		return 0;
@@ -563,13 +563,13 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	size_t ret = 0;
 
 	if (verbose)
-		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
+		ret += fprintf(fp, "%#018llx  ", (u64)self->ip);
 
 	if (self->sym) {
 		ret += fprintf(fp, "[%c] %s",
 			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
 	} else {
-		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+		ret += fprintf(fp, "%#016llx", (u64)self->ip);
 	}
 
 	return ret;
@@ -660,7 +660,7 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 /*
  * collect histogram counts
  */
-static void hist_hit(struct hist_entry *he, __u64 ip)
+static void hist_hit(struct hist_entry *he, u64 ip)
 {
 	unsigned int sym_size, offset;
 	struct symbol *sym = he->sym;
@@ -689,7 +689,7 @@ static void hist_hit(struct hist_entry *he, __u64 ip)
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, __u64 ip, char level)
+		struct symbol *sym, u64 ip, char level)
 {
 	struct rb_node **p = &hist.rb_node;
 	struct rb_node *parent = NULL;
@@ -861,7 +861,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	int show = 0;
 	struct dso *dso = NULL;
 	struct thread *thread = threads__findnew(event->ip.pid);
-	__u64 ip = event->ip.ip;
+	u64 ip = event->ip.ip;
 	struct map *map = NULL;
 
 	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
@@ -1062,14 +1062,14 @@ static char *get_color(double percent)
 }
 
 static int
-parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
+parse_line(FILE *file, struct symbol *sym, u64 start, u64 len)
 {
 	char *line = NULL, *tmp, *tmp2;
 	static const char *prev_line;
 	static const char *prev_color;
 	unsigned int offset;
 	size_t line_len;
-	__u64 line_ip;
+	u64 line_ip;
 	int ret;
 	char *c;
 
@@ -1191,7 +1191,7 @@ static void free_source_line(struct symbol *sym, int len)
 
 /* Get the filename:line for the colored entries */
 static void
-get_source_line(struct symbol *sym, __u64 start, int len, char *filename)
+get_source_line(struct symbol *sym, u64 start, int len, char *filename)
 {
 	int i;
 	char cmd[PATH_MAX * 2];
@@ -1209,7 +1209,7 @@ get_source_line(struct symbol *sym, __u64 start, int len, char *filename)
 	for (i = 0; i < len; i++) {
 		char *path = NULL;
 		size_t line_len;
-		__u64 offset;
+		u64 offset;
 		FILE *fp;
 
 		sym_ext[i].percent = 100.0 * sym->hist[i] / sym->hist_sum;
@@ -1269,7 +1269,7 @@ static void print_summary(char *filename)
 static void annotate_sym(struct dso *dso, struct symbol *sym)
 {
 	char *filename = dso->name;
-	__u64 start, end, len;
+	u64 start, end, len;
 	char command[PATH_MAX*2];
 	FILE *file;
 
@@ -1297,7 +1297,7 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
 	if (verbose >= 2)
 		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
 
-	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
+	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (u64)start, (u64)end, filename);
 
 	if (verbose >= 3)
 		printf("doing: %s\n", command);
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 28304677c73e..e2cebc053bd7 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -44,7 +44,7 @@ static long			samples;
 static struct timeval		last_read;
 static struct timeval		this_read;
 
-static __u64			bytes_written;
+static u64			bytes_written;
 
 static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];
 
@@ -56,18 +56,18 @@ static struct perf_file_header	file_header;
 
 struct mmap_event {
 	struct perf_event_header	header;
-	__u32				pid;
-	__u32				tid;
-	__u64				start;
-	__u64				len;
-	__u64				pgoff;
+	u32				pid;
+	u32				tid;
+	u64				start;
+	u64				len;
+	u64				pgoff;
 	char				filename[PATH_MAX];
 };
 
 struct comm_event {
 	struct perf_event_header	header;
-	__u32				pid;
-	__u32				tid;
+	u32				pid;
+	u32				tid;
 	char				comm[16];
 };
 
@@ -238,7 +238,7 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 
 	comm_ev.pid = pid;
 	comm_ev.header.type = PERF_EVENT_COMM;
-	size = ALIGN(size, sizeof(__u64));
+	size = ALIGN(size, sizeof(u64));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
 
 	if (!full) {
@@ -315,7 +315,7 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 			size = strlen(execname);
 			execname[size - 1] = '\0'; /* Remove \n */
 			memcpy(mmap_ev.filename, execname, size);
-			size = ALIGN(size, sizeof(__u64));
+			size = ALIGN(size, sizeof(u64));
 			mmap_ev.len -= mmap_ev.start;
 			mmap_ev.header.size = (sizeof(mmap_ev) -
 					       (sizeof(mmap_ev.filename) - size));
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 37b26ecb0d0b..de1b97845e9e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -54,47 +54,47 @@ static int		exclude_other = 1;
 
 struct ip_event {
 	struct perf_event_header header;
-	__u64 ip;
-	__u32 pid, tid;
+	u64 ip;
+	u32 pid, tid;
 	unsigned char __more_data[];
 };
 
 struct ip_callchain {
-	__u64 nr;
-	__u64 ips[0];
+	u64 nr;
+	u64 ips[0];
 };
 
 struct mmap_event {
 	struct perf_event_header header;
-	__u32 pid, tid;
-	__u64 start;
-	__u64 len;
-	__u64 pgoff;
+	u32 pid, tid;
+	u64 start;
+	u64 len;
+	u64 pgoff;
 	char filename[PATH_MAX];
 };
 
 struct comm_event {
 	struct perf_event_header header;
-	__u32 pid, tid;
+	u32 pid, tid;
 	char comm[16];
 };
 
 struct fork_event {
 	struct perf_event_header header;
-	__u32 pid, ppid;
+	u32 pid, ppid;
 };
 
 struct period_event {
 	struct perf_event_header header;
-	__u64 time;
-	__u64 id;
-	__u64 sample_period;
+	u64 time;
+	u64 id;
+	u64 sample_period;
 };
 
 struct lost_event {
 	struct perf_event_header header;
-	__u64 id;
-	__u64 lost;
+	u64 id;
+	u64 lost;
 };
 
 typedef union event_union {
@@ -163,7 +163,7 @@ static void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
-static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
+static struct symbol *vdso__find_symbol(struct dso *dso, u64 ip)
 {
 	return dso__find_symbol(kernel_dso, ip);
 }
@@ -210,19 +210,19 @@ static int strcommon(const char *pathname)
 
 struct map {
 	struct list_head node;
-	__u64	 start;
-	__u64	 end;
-	__u64	 pgoff;
-	__u64	 (*map_ip)(struct map *, __u64);
+	u64	 start;
+	u64	 end;
+	u64	 pgoff;
+	u64	 (*map_ip)(struct map *, u64);
 	struct dso	 *dso;
 };
 
-static __u64 map__map_ip(struct map *map, __u64 ip)
+static u64 map__map_ip(struct map *map, u64 ip)
 {
 	return ip - map->start + map->pgoff;
 }
 
-static __u64 vdso__map_ip(struct map *map, __u64 ip)
+static u64 vdso__map_ip(struct map *map, u64 ip)
 {
 	return ip;
 }
@@ -429,7 +429,7 @@ static int thread__fork(struct thread *self, struct thread *parent)
 	return 0;
 }
 
-static struct map *thread__find_map(struct thread *self, __u64 ip)
+static struct map *thread__find_map(struct thread *self, u64 ip)
 {
 	struct map *pos;
 
@@ -471,10 +471,10 @@ struct hist_entry {
 	struct dso	 *dso;
 	struct symbol	 *sym;
 	struct symbol	 *parent;
-	__u64		 ip;
+	u64		 ip;
 	char		 level;
 
-	__u64		 count;
+	u64		 count;
 };
 
 /*
@@ -574,7 +574,7 @@ sort__dso_print(FILE *fp, struct hist_entry *self)
 	if (self->dso)
 		return fprintf(fp, "%-25s", self->dso->name);
 
-	return fprintf(fp, "%016llx         ", (__u64)self->ip);
+	return fprintf(fp, "%016llx         ", (u64)self->ip);
 }
 
 static struct sort_entry sort_dso = {
@@ -588,7 +588,7 @@ static struct sort_entry sort_dso = {
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	__u64 ip_l, ip_r;
+	u64 ip_l, ip_r;
 
 	if (left->sym == right->sym)
 		return 0;
@@ -605,13 +605,13 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	size_t ret = 0;
 
 	if (verbose)
-		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
+		ret += fprintf(fp, "%#018llx  ", (u64)self->ip);
 
 	if (self->sym) {
 		ret += fprintf(fp, "[%c] %s",
 			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
 	} else {
-		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+		ret += fprintf(fp, "%#016llx", (u64)self->ip);
 	}
 
 	return ret;
@@ -745,7 +745,7 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 }
 
 static size_t
-hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples)
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, u64 total_samples)
 {
 	struct sort_entry *se;
 	size_t ret;
@@ -793,7 +793,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples)
 
 static struct symbol *
 resolve_symbol(struct thread *thread, struct map **mapp,
-	       struct dso **dsop, __u64 *ipp)
+	       struct dso **dsop, u64 *ipp)
 {
 	struct dso *dso = dsop ? *dsop : NULL;
 	struct map *map = mapp ? *mapp : NULL;
@@ -852,8 +852,8 @@ static int call__match(struct symbol *sym)
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, __u64 ip, struct ip_callchain *chain,
-		char level, __u64 count)
+		struct symbol *sym, u64 ip, struct ip_callchain *chain,
+		char level, u64 count)
 {
 	struct rb_node **p = &hist.rb_node;
 	struct rb_node *parent = NULL;
@@ -871,11 +871,11 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 	int cmp;
 
 	if (sort__has_parent && chain) {
-		__u64 context = PERF_CONTEXT_MAX;
+		u64 context = PERF_CONTEXT_MAX;
 		int i;
 
 		for (i = 0; i < chain->nr; i++) {
-			__u64 ip = chain->ips[i];
+			u64 ip = chain->ips[i];
 			struct dso *dso = NULL;
 			struct symbol *sym;
 
@@ -1032,7 +1032,7 @@ static void output__resort(void)
 	}
 }
 
-static size_t output__fprintf(FILE *fp, __u64 total_samples)
+static size_t output__fprintf(FILE *fp, u64 total_samples)
 {
 	struct hist_entry *pos;
 	struct sort_entry *se;
@@ -1041,7 +1041,7 @@ static size_t output__fprintf(FILE *fp, __u64 total_samples)
 
 	fprintf(fp, "\n");
 	fprintf(fp, "#\n");
-	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
+	fprintf(fp, "# (%Ld samples)\n", (u64)total_samples);
 	fprintf(fp, "#\n");
 
 	fprintf(fp, "# Overhead");
@@ -1108,7 +1108,7 @@ static int validate_chain(struct ip_callchain *chain, event_t *event)
 	chain_size = event->header.size;
 	chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event;
 
-	if (chain->nr*sizeof(__u64) > chain_size)
+	if (chain->nr*sizeof(u64) > chain_size)
 		return -1;
 
 	return 0;
@@ -1121,15 +1121,15 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	int show = 0;
 	struct dso *dso = NULL;
 	struct thread *thread = threads__findnew(event->ip.pid);
-	__u64 ip = event->ip.ip;
-	__u64 period = 1;
+	u64 ip = event->ip.ip;
+	u64 period = 1;
 	struct map *map = NULL;
 	void *more_data = event->ip.__more_data;
 	struct ip_callchain *chain = NULL;
 
 	if (event->header.type & PERF_SAMPLE_PERIOD) {
-		period = *(__u64 *)more_data;
-		more_data += sizeof(__u64);
+		period = *(u64 *)more_data;
+		more_data += sizeof(u64);
 	}
 
 	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index e5b3c0ff03a9..6d3eeac1ea25 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -85,29 +85,29 @@ static const unsigned int default_count[] = {
 static int			run_count		=  1;
 static int			run_idx			=  0;
 
-static __u64			event_res[MAX_RUN][MAX_COUNTERS][3];
-static __u64			event_scaled[MAX_RUN][MAX_COUNTERS];
+static u64			event_res[MAX_RUN][MAX_COUNTERS][3];
+static u64			event_scaled[MAX_RUN][MAX_COUNTERS];
 
-//static __u64			event_hist[MAX_RUN][MAX_COUNTERS][3];
+//static u64			event_hist[MAX_RUN][MAX_COUNTERS][3];
 
 
-static __u64			runtime_nsecs[MAX_RUN];
-static __u64			walltime_nsecs[MAX_RUN];
-static __u64			runtime_cycles[MAX_RUN];
+static u64			runtime_nsecs[MAX_RUN];
+static u64			walltime_nsecs[MAX_RUN];
+static u64			runtime_cycles[MAX_RUN];
 
-static __u64			event_res_avg[MAX_COUNTERS][3];
-static __u64			event_res_noise[MAX_COUNTERS][3];
+static u64			event_res_avg[MAX_COUNTERS][3];
+static u64			event_res_noise[MAX_COUNTERS][3];
 
-static __u64			event_scaled_avg[MAX_COUNTERS];
+static u64			event_scaled_avg[MAX_COUNTERS];
 
-static __u64			runtime_nsecs_avg;
-static __u64			runtime_nsecs_noise;
+static u64			runtime_nsecs_avg;
+static u64			runtime_nsecs_noise;
 
-static __u64			walltime_nsecs_avg;
-static __u64			walltime_nsecs_noise;
+static u64			walltime_nsecs_avg;
+static u64			walltime_nsecs_noise;
 
-static __u64			runtime_cycles_avg;
-static __u64			runtime_cycles_noise;
+static u64			runtime_cycles_avg;
+static u64			runtime_cycles_noise;
 
 static void create_perf_stat_counter(int counter)
 {
@@ -158,7 +158,7 @@ static inline int nsec_counter(int counter)
  */
 static void read_counter(int counter)
 {
-	__u64 *count, single_count[3];
+	u64 *count, single_count[3];
 	ssize_t res;
 	int cpu, nv;
 	int scaled;
@@ -172,8 +172,8 @@ static void read_counter(int counter)
 		if (fd[cpu][counter] < 0)
 			continue;
 
-		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
-		assert(res == nv * sizeof(__u64));
+		res = read(fd[cpu][counter], single_count, nv * sizeof(u64));
+		assert(res == nv * sizeof(u64));
 		close(fd[cpu][counter]);
 		fd[cpu][counter] = -1;
 
@@ -251,14 +251,14 @@ static int run_perf_stat(int argc, const char **argv)
 	return WEXITSTATUS(status);
 }
 
-static void print_noise(__u64 *count, __u64 *noise)
+static void print_noise(u64 *count, u64 *noise)
 {
 	if (run_count > 1)
 		fprintf(stderr, "   ( +- %7.3f%% )",
 			(double)noise[0]/(count[0]+1)*100.0);
 }
 
-static void nsec_printout(int counter, __u64 *count, __u64 *noise)
+static void nsec_printout(int counter, u64 *count, u64 *noise)
 {
 	double msecs = (double)count[0] / 1000000;
 
@@ -274,7 +274,7 @@ static void nsec_printout(int counter, __u64 *count, __u64 *noise)
 	print_noise(count, noise);
 }
 
-static void abs_printout(int counter, __u64 *count, __u64 *noise)
+static void abs_printout(int counter, u64 *count, u64 *noise)
 {
 	fprintf(stderr, " %14Ld  %-20s", count[0], event_name(counter));
 
@@ -298,7 +298,7 @@ static void abs_printout(int counter, __u64 *count, __u64 *noise)
  */
 static void print_counter(int counter)
 {
-	__u64 *count, *noise;
+	u64 *count, *noise;
 	int scaled;
 
 	count = event_res_avg[counter];
@@ -326,16 +326,16 @@ static void print_counter(int counter)
 /*
  * normalize_noise noise values down to stddev:
  */
-static void normalize_noise(__u64 *val)
+static void normalize_noise(u64 *val)
 {
 	double res;
 
 	res = (double)*val / (run_count * sqrt((double)run_count));
 
-	*val = (__u64)res;
+	*val = (u64)res;
 }
 
-static void update_avg(const char *name, int idx, __u64 *avg, __u64 *val)
+static void update_avg(const char *name, int idx, u64 *avg, u64 *val)
 {
 	*avg += *val;
 
@@ -380,19 +380,19 @@ static void calc_avg(void)
 
 	for (i = 0; i < run_count; i++) {
 		runtime_nsecs_noise +=
-			abs((__s64)(runtime_nsecs[i] - runtime_nsecs_avg));
+			abs((s64)(runtime_nsecs[i] - runtime_nsecs_avg));
 		walltime_nsecs_noise +=
-			abs((__s64)(walltime_nsecs[i] - walltime_nsecs_avg));
+			abs((s64)(walltime_nsecs[i] - walltime_nsecs_avg));
 		runtime_cycles_noise +=
-			abs((__s64)(runtime_cycles[i] - runtime_cycles_avg));
+			abs((s64)(runtime_cycles[i] - runtime_cycles_avg));
 
 		for (j = 0; j < nr_counters; j++) {
 			event_res_noise[j][0] +=
-				abs((__s64)(event_res[i][j][0] - event_res_avg[j][0]));
+				abs((s64)(event_res[i][j][0] - event_res_avg[j][0]));
 			event_res_noise[j][1] +=
-				abs((__s64)(event_res[i][j][1] - event_res_avg[j][1]));
+				abs((s64)(event_res[i][j][1] - event_res_avg[j][1]));
 			event_res_noise[j][2] +=
-				abs((__s64)(event_res[i][j][2] - event_res_avg[j][2]));
+				abs((s64)(event_res[i][j][2] - event_res_avg[j][2]));
 		}
 	}
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index fe338d3c5d7e..5352b5e352ed 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -54,7 +54,7 @@ static int			system_wide			=  0;
 
 static int			default_interval		= 100000;
 
-static __u64			count_filter			=  5;
+static u64			count_filter			=  5;
 static int			print_entries			= 15;
 
 static int			target_pid			= -1;
@@ -79,8 +79,8 @@ static int			dump_symtab;
  * Symbols
  */
 
-static __u64			min_ip;
-static __u64			max_ip = -1ll;
+static u64			min_ip;
+static u64			max_ip = -1ll;
 
 struct sym_entry {
 	struct rb_node		rb_node;
@@ -194,7 +194,7 @@ static void print_sym_table(void)
 		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
 
 	if (nr_counters == 1) {
-		printf("%Ld", attrs[0].sample_period);
+		printf("%Ld", (u64)attrs[0].sample_period);
 		if (freq)
 			printf("Hz ");
 		else
@@ -372,7 +372,7 @@ out_delete_dso:
 /*
  * Binary search in the histogram table and record the hit:
  */
-static void record_ip(__u64 ip, int counter)
+static void record_ip(u64 ip, int counter)
 {
 	struct symbol *sym = dso__find_symbol(kernel_dso, ip);
 
@@ -392,7 +392,7 @@ static void record_ip(__u64 ip, int counter)
 	samples--;
 }
 
-static void process_event(__u64 ip, int counter)
+static void process_event(u64 ip, int counter)
 {
 	samples++;
 
@@ -463,15 +463,15 @@ static void mmap_read_counter(struct mmap_data *md)
 	for (; old != head;) {
 		struct ip_event {
 			struct perf_event_header header;
-			__u64 ip;
-			__u32 pid, target_pid;
+			u64 ip;
+			u32 pid, target_pid;
 		};
 		struct mmap_event {
 			struct perf_event_header header;
-			__u32 pid, target_pid;
-			__u64 start;
-			__u64 len;
-			__u64 pgoff;
+			u32 pid, target_pid;
+			u64 start;
+			u64 len;
+			u64 pgoff;
 			char filename[PATH_MAX];
 		};
 
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 55c62f4b990b..bccb529dac08 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -19,6 +19,7 @@
 #include <sys/syscall.h>
 
 #include "../../include/linux/perf_counter.h"
+#include "types.h"
 
 /*
  * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
@@ -66,9 +67,9 @@ sys_perf_counter_open(struct perf_counter_attr *attr,
 #define MAX_NR_CPUS			256
 
 struct perf_file_header {
-	__u64	version;
-	__u64	sample_type;
-	__u64	data_size;
+	u64	version;
+	u64	sample_type;
+	u64	data_size;
 };
 
 #endif
diff --git a/tools/perf/types.h b/tools/perf/types.h
new file mode 100644
index 000000000000..5e75f9005940
--- /dev/null
+++ b/tools/perf/types.h
@@ -0,0 +1,17 @@
+#ifndef _PERF_TYPES_H
+#define _PERF_TYPES_H
+
+/*
+ * We define u64 as unsigned long long for every architecture
+ * so that we can print it with %Lx without getting warnings.
+ */
+typedef unsigned long long u64;
+typedef signed long long   s64;
+typedef unsigned int	   u32;
+typedef signed int	   s32;
+typedef unsigned short	   u16;
+typedef signed short	   s16;
+typedef unsigned char	   u8;
+typedef signed char	   s8;
+
+#endif /* _PERF_TYPES_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index f0c9f2627fe1..35d04da38d6a 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -13,8 +13,8 @@ int					nr_counters;
 struct perf_counter_attr		attrs[MAX_COUNTERS];
 
 struct event_symbol {
-	__u8	type;
-	__u64	config;
+	u8	type;
+	u64	config;
 	char	*symbol;
 };
 
@@ -96,7 +96,7 @@ static char *hw_cache_result [][MAX_ALIASES] = {
 
 char *event_name(int counter)
 {
-	__u64 config = attrs[counter].config;
+	u64 config = attrs[counter].config;
 	int type = attrs[counter].type;
 	static char buf[32];
 
@@ -112,7 +112,7 @@ char *event_name(int counter)
 		return "unknown-hardware";
 
 	case PERF_TYPE_HW_CACHE: {
-		__u8 cache_type, cache_op, cache_result;
+		u8 cache_type, cache_op, cache_result;
 		static char name[100];
 
 		cache_type   = (config >>  0) & 0xff;
@@ -202,7 +202,7 @@ static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *a
  */
 static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
 {
-	__u64 config, id;
+	u64 config, id;
 	int type;
 	unsigned int i;
 	const char *sep, *pstr;
diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c
index ec33c0c7f4e2..c93eca9a7be3 100644
--- a/tools/perf/util/string.c
+++ b/tools/perf/util/string.c
@@ -15,7 +15,7 @@ static int hex(char ch)
  * While we find nice hex chars, build a long_val.
  * Return number of chars processed.
  */
-int hex2u64(const char *ptr, __u64 *long_val)
+int hex2u64(const char *ptr, u64 *long_val)
 {
 	const char *p = ptr;
 	*long_val = 0;
diff --git a/tools/perf/util/string.h b/tools/perf/util/string.h
index 72812c1c9a7a..37b03255b425 100644
--- a/tools/perf/util/string.h
+++ b/tools/perf/util/string.h
@@ -1,8 +1,8 @@
 #ifndef _PERF_STRING_H_
 #define _PERF_STRING_H_
 
-#include <linux/types.h>
+#include "../types.h"
 
-int hex2u64(const char *ptr, __u64 *val);
+int hex2u64(const char *ptr, u64 *val);
 
 #endif
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 49a55f813712..86e14375e74e 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -9,9 +9,9 @@
 
 const char *sym_hist_filter;
 
-static struct symbol *symbol__new(__u64 start, __u64 len,
+static struct symbol *symbol__new(u64 start, u64 len,
 				  const char *name, unsigned int priv_size,
-				  __u64 obj_start, int verbose)
+				  u64 obj_start, int verbose)
 {
 	size_t namelen = strlen(name) + 1;
 	struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
@@ -21,14 +21,14 @@ static struct symbol *symbol__new(__u64 start, __u64 len,
 
 	if (verbose >= 2)
 		printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
-			(__u64)start, (unsigned long)len, name, self->hist, (void *)(unsigned long)obj_start);
+			(u64)start, (unsigned long)len, name, self->hist, (void *)(unsigned long)obj_start);
 
 	self->obj_start= obj_start;
 	self->hist = NULL;
 	self->hist_sum = 0;
 
 	if (sym_hist_filter && !strcmp(name, sym_hist_filter))
-		self->hist = calloc(sizeof(__u64), len);
+		self->hist = calloc(sizeof(u64), len);
 
 	if (priv_size) {
 		memset(self, 0, priv_size);
@@ -89,7 +89,7 @@ static void dso__insert_symbol(struct dso *self, struct symbol *sym)
 {
 	struct rb_node **p = &self->syms.rb_node;
 	struct rb_node *parent = NULL;
-	const __u64 ip = sym->start;
+	const u64 ip = sym->start;
 	struct symbol *s;
 
 	while (*p != NULL) {
@@ -104,7 +104,7 @@ static void dso__insert_symbol(struct dso *self, struct symbol *sym)
 	rb_insert_color(&sym->rb_node, &self->syms);
 }
 
-struct symbol *dso__find_symbol(struct dso *self, __u64 ip)
+struct symbol *dso__find_symbol(struct dso *self, u64 ip)
 {
 	struct rb_node *n;
 
@@ -151,7 +151,7 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verb
 		goto out_failure;
 
 	while (!feof(file)) {
-		__u64 start;
+		u64 start;
 		struct symbol *sym;
 		int line_len, len;
 		char symbol_type;
@@ -232,7 +232,7 @@ static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int verb
 		goto out_failure;
 
 	while (!feof(file)) {
-		__u64 start, size;
+		u64 start, size;
 		struct symbol *sym;
 		int line_len, len;
 
@@ -353,7 +353,7 @@ static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 {
 	uint32_t nr_rel_entries, idx;
 	GElf_Sym sym;
-	__u64 plt_offset;
+	u64 plt_offset;
 	GElf_Shdr shdr_plt;
 	struct symbol *f;
 	GElf_Shdr shdr_rel_plt;
@@ -523,7 +523,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
-		__u64 obj_start;
+		u64 obj_start;
 
 		if (!elf_sym__is_function(&sym))
 			continue;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 5ad9b06c3f6f..ea332e56e458 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -2,16 +2,17 @@
 #define _PERF_SYMBOL_ 1
 
 #include <linux/types.h>
+#include "../types.h"
 #include "list.h"
 #include "rbtree.h"
 
 struct symbol {
 	struct rb_node	rb_node;
-	__u64		start;
-	__u64		end;
-	__u64		obj_start;
-	__u64		hist_sum;
-	__u64		*hist;
+	u64		start;
+	u64		end;
+	u64		obj_start;
+	u64		hist_sum;
+	u64		*hist;
 	void		*priv;
 	char		name[0];
 };
@@ -20,7 +21,7 @@ struct dso {
 	struct list_head node;
 	struct rb_root	 syms;
 	unsigned int	 sym_priv_size;
-	struct symbol    *(*find_symbol)(struct dso *, __u64 ip);
+	struct symbol    *(*find_symbol)(struct dso *, u64 ip);
 	char		 name[0];
 };
 
@@ -36,7 +37,7 @@ static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
 	return ((void *)sym) - self->sym_priv_size;
 }
 
-struct symbol *dso__find_symbol(struct dso *self, __u64 ip);
+struct symbol *dso__find_symbol(struct dso *self, u64 ip);
 
 int dso__load_kernel(struct dso *self, const char *vmlinux,
 		     symbol_filter_t filter, int verbose);

From 92bf309a9cd5fedd6c8eefbce0b9a95ada82d0a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 19 Jun 2009 18:11:53 +0200
Subject: [PATCH 48/49] perf_counter: Push perf_sample_data through the
 swcounter code

Push the perf_sample_data further outwards to the swcounter interface,
to abstract it away some more.
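
The shape of the change, using the perf_swcounter_add() signatures from
the diff below:

  /* Before: every hop carries the raw sample arguments. */
  static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
  			       int nmi, struct pt_regs *regs, u64 addr);

  /* After: one descriptor pointer; new fields (period is already
   * filled in by perf_swcounter_overflow()) can be added without
   * touching any intermediate signature.
   */
  static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
  			       int nmi, struct perf_sample_data *data);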

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 55 +++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index adb6ae506d5b..1a933a221ea4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3171,20 +3171,15 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 }
 
 static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct pt_regs *regs, u64 addr)
+				    int nmi, struct perf_sample_data *data)
 {
-	struct perf_sample_data data = {
-		.regs	= regs,
-		.addr	= addr,
-		.period	= counter->hw.last_period,
-	};
+	data->period = counter->hw.last_period;
 
 	perf_swcounter_update(counter);
 	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, &data))
+	if (perf_counter_overflow(counter, nmi, data))
 		/* soft-disable the counter */
 		;
-
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3249,18 +3244,18 @@ static int perf_swcounter_match(struct perf_counter *counter,
 }
 
 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct pt_regs *regs, u64 addr)
+			       int nmi, struct perf_sample_data *data)
 {
 	int neg = atomic64_add_negative(nr, &counter->hw.count);
 
-	if (counter->hw.sample_period && !neg && regs)
-		perf_swcounter_overflow(counter, nmi, regs, addr);
+	if (counter->hw.sample_period && !neg && data->regs)
+		perf_swcounter_overflow(counter, nmi, data);
 }
 
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
-				     enum perf_type_id type, u32 event,
-				     u64 nr, int nmi, struct pt_regs *regs,
-				     u64 addr)
+				     enum perf_type_id type,
+				     u32 event, u64 nr, int nmi,
+				     struct perf_sample_data *data)
 {
 	struct perf_counter *counter;
 
@@ -3269,8 +3264,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_swcounter_match(counter, type, event, regs))
-			perf_swcounter_add(counter, nr, nmi, regs, addr);
+		if (perf_swcounter_match(counter, type, event, data->regs))
+			perf_swcounter_add(counter, nr, nmi, data);
 	}
 	rcu_read_unlock();
 }
@@ -3289,9 +3284,9 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 	return &cpuctx->recursion[0];
 }
 
-static void __perf_swcounter_event(enum perf_type_id type, u32 event,
-				   u64 nr, int nmi, struct pt_regs *regs,
-				   u64 addr)
+static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
+				    u64 nr, int nmi,
+				    struct perf_sample_data *data)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 	int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3304,7 +3299,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
 	barrier();
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
-				 nr, nmi, regs, addr);
+				 nr, nmi, data);
 	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
@@ -3312,7 +3307,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
 	 */
 	ctx = rcu_dereference(current->perf_counter_ctxp);
 	if (ctx)
-		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
+		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
 	rcu_read_unlock();
 
 	barrier();
@@ -3325,7 +3320,12 @@ out:
 void
 perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
 {
-	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
+	struct perf_sample_data data = {
+		.regs = regs,
+		.addr = addr,
+	};
+
+	do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
 }
 
 static void perf_swcounter_read(struct perf_counter *counter)
@@ -3469,12 +3469,15 @@ static const struct pmu perf_ops_task_clock = {
 #ifdef CONFIG_EVENT_PROFILE
 void perf_tpcounter_event(int event_id)
 {
-	struct pt_regs *regs = get_irq_regs();
+	struct perf_sample_data data = {
+		.regs = get_irq_regs(),
+		.addr = 0,
+	};
 
-	if (!regs)
-		regs = task_pt_regs(current);
+	if (!data.regs)
+		data.regs = task_pt_regs(current);
 
-	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
+	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 

From eadc84cc01e04f9f74ec2de0c9355be035c7b396 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 20 Jun 2009 02:01:40 +0200
Subject: [PATCH 49/49] perfcounter: Handle some IO return values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Building perfcounter tools raises the following warnings:

 builtin-record.c: In function ‘atexit_header’:
 builtin-record.c:464: error: ignoring return value of ‘pwrite’, declared with attribute warn_unused_result
 builtin-record.c: In function ‘__cmd_record’:
 builtin-record.c:503: error: ignoring return value of ‘read’, declared with attribute warn_unused_result

 builtin-report.c: In function ‘__cmd_report’:
 builtin-report.c:1403: error: ignoring return value of ‘read’, declared with attribute warn_unused_result

This patch handles these IO return values.
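
The checks below treat -1 as the only failure; a stricter variant (a
hypothetical read_all() helper, sketched under the assumption that short
reads should be fatal too) would loop until the full buffer arrives:

  #include <errno.h>
  #include <stddef.h>
  #include <unistd.h>

  /* Hypothetical helper: read exactly 'count' bytes or fail. */
  static int read_all(int fd, void *buf, size_t count)
  {
  	char *p = buf;

  	while (count) {
  		ssize_t ret = read(fd, p, count);

  		if (ret < 0 && errno == EINTR)
  			continue;
  		if (ret <= 0)		/* hard error or premature EOF */
  			return -1;
  		p += ret;
  		count -= (size_t)ret;
  	}
  	return 0;
  }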

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1245456100-5477-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 9 +++++++--
 tools/perf/builtin-report.c | 5 ++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index e2cebc053bd7..d7ebbd757543 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -461,7 +461,8 @@ static void atexit_header(void)
 {
 	file_header.data_size += bytes_written;
 
-	pwrite(output, &file_header, sizeof(file_header), 0);
+	if (pwrite(output, &file_header, sizeof(file_header), 0) == -1)
+		perror("failed to write on file headers");
 }
 
 static int __cmd_record(int argc, const char **argv)
@@ -500,7 +501,11 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	if (!file_new) {
-		read(output, &file_header, sizeof(file_header));
+		if (read(output, &file_header, sizeof(file_header)) == -1) {
+			perror("failed to read file headers");
+			exit(-1);
+		}
+
 		lseek(output, file_header.data_size, SEEK_CUR);
 	}
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index de1b97845e9e..5eb5566f0c95 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1400,7 +1400,10 @@ static int __cmd_report(void)
 		exit(0);
 	}
 
-	read(input, &file_header, sizeof(file_header));
+	if (read(input, &file_header, sizeof(file_header)) == -1) {
+		perror("failed to read file headers");
+		exit(-1);
+	}
 
 	if (sort__has_parent &&
 	    !(file_header.sample_type & PERF_SAMPLE_CALLCHAIN)) {