samples/bpf: add map_lookup microbenchmark
$ map_perf_test 128
speed of HASH bpf_map_lookup_elem() in lookups per second
	w/o JIT		w/JIT
before	46M		58M
after	42M		74M
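(The argument to map_perf_test is a bitmask selecting which subtests run:
128 is 1 << 7, the new HASH_LOOKUP flag, and 256 used further below is
1 << 8, ARRAY_LOOKUP; see the flag definitions added to
map_perf_test_user.c in the diff.)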
perf report
before:
    54.23%  map_perf_test  [kernel.kallsyms]  [k] __htab_map_lookup_elem
    14.24%  map_perf_test  [kernel.kallsyms]  [k] lookup_elem_raw
     8.84%  map_perf_test  [kernel.kallsyms]  [k] htab_map_lookup_elem
     5.93%  map_perf_test  [kernel.kallsyms]  [k] bpf_map_lookup_elem
     2.30%  map_perf_test  [kernel.kallsyms]  [k] bpf_prog_da4fc6a3f41761a2
     1.49%  map_perf_test  [kernel.kallsyms]  [k] kprobe_ftrace_handler
after:
    60.03%  map_perf_test  [kernel.kallsyms]  [k] __htab_map_lookup_elem
    18.07%  map_perf_test  [kernel.kallsyms]  [k] lookup_elem_raw
     2.91%  map_perf_test  [kernel.kallsyms]  [k] bpf_prog_da4fc6a3f41761a2
     1.94%  map_perf_test  [kernel.kallsyms]  [k] _einittext
     1.90%  map_perf_test  [kernel.kallsyms]  [k] __audit_syscall_exit
     1.72%  map_perf_test  [kernel.kallsyms]  [k] kprobe_ftrace_handler
Notice that bpf_map_lookup_elem() and htab_map_lookup_elem() are trivial
functions, yet they take a sizeable amount of cpu time.
htab_map_gen_lookup() removes the bpf_map_lookup_elem() call and converts
htab_map_lookup_elem() into three BPF insns, which causes the cpu time
for bpf_prog_da4fc6a3f41761a2() to increase slightly.
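For reference, the inlined sequence looks roughly like the following.
This is a sketch modeled on the in-tree htab_map_gen_lookup()
(kernel/bpf/hashtab.c, added by the companion patch); the idea is to call
__htab_map_lookup_elem() directly and turn the returned htab_elem pointer
into a value pointer:

static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	/* call __htab_map_lookup_elem() directly, skipping the
	 * bpf_map_lookup_elem() and htab_map_lookup_elem() wrappers
	 */
	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
	/* element not found: leave R0 == NULL */
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	/* element found: advance past the (8-byte aligned) key so
	 * R0 points at the value
	 */
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}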
$ map_perf_test 256
speed of ARRAY bpf_map_lookup_elem() in lookups per second
	w/o JIT		w/JIT
before	97M		174M
after	64M		280M
before:
    37.33%  map_perf_test  [kernel.kallsyms]  [k] array_map_lookup_elem
    13.95%  map_perf_test  [kernel.kallsyms]  [k] bpf_map_lookup_elem
     6.54%  map_perf_test  [kernel.kallsyms]  [k] bpf_prog_da4fc6a3f41761a2
     4.57%  map_perf_test  [kernel.kallsyms]  [k] kprobe_ftrace_handler
after:
    32.86%  map_perf_test  [kernel.kallsyms]  [k] bpf_prog_da4fc6a3f41761a2
     6.54%  map_perf_test  [kernel.kallsyms]  [k] kprobe_ftrace_handler
array_map_gen_lookup() removes the calls to array_map_lookup_elem()
and bpf_map_lookup_elem() and replaces them with 7 BPF insns.
Performance without JIT is lower, since executing the extra insns
in the interpreter is slower than running native C code,
but with JIT the gains are obvious,
since native C->x86 code is replaced with fewer bpf->x86 instructions.
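Those 7 insns look roughly like this. This is a sketch following the shape
of the in-tree array_map_gen_lookup() (kernel/bpf/arraymap.c, added by the
companion patch), showing only the power-of-two elem_size path; the
non-power-of-two case uses BPF_MUL instead of the shift:

static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	u32 elem_size = round_up(map->value_size, 8);
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;	/* R1 holds the map pointer */
	const int index = BPF_REG_2;	/* R2 holds the key pointer */

	/* R1 = &array->value[0] */
	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	/* R0 = *(u32 *)key */
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	/* bounds check: out-of-range index jumps to the NULL return below */
	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
	/* R0 = index * elem_size (shift, since elem_size is a power of two) */
	*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	/* R0 = &array->value[index * elem_size] */
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);	/* out of bounds: R0 = NULL */
	return insn - insn_buf;
}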
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -65,6 +65,13 @@ struct bpf_map_def SEC("maps") lpm_trie_map_alloc = {
 	.map_flags = BPF_F_NO_PREALLOC,
 };
 
+struct bpf_map_def SEC("maps") array_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = MAX_ENTRIES,
+};
+
 SEC("kprobe/sys_getuid")
 int stress_hmap(struct pt_regs *ctx)
 {
@@ -165,5 +172,31 @@ int stress_lpm_trie_map_alloc(struct pt_regs *ctx)
 	return 0;
 }
 
+SEC("kprobe/sys_getpgid")
+int stress_hash_map_lookup(struct pt_regs *ctx)
+{
+	u32 key = 1, i;
+	long *value;
+
+#pragma clang loop unroll(full)
+	for (i = 0; i < 64; ++i)
+		value = bpf_map_lookup_elem(&hash_map, &key);
+
+	return 0;
+}
+
+SEC("kprobe/sys_getpgrp")
+int stress_array_map_lookup(struct pt_regs *ctx)
+{
+	u32 key = 1, i;
+	long *value;
+
+#pragma clang loop unroll(full)
+	for (i = 0; i < 64; ++i)
+		value = bpf_map_lookup_elem(&array_map, &key);
+
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
 u32 _version SEC("version") = LINUX_VERSION_CODE;
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -38,6 +38,8 @@ static __u64 time_get_ns(void)
 #define LRU_HASH_PREALLOC	(1 << 4)
 #define PERCPU_LRU_HASH_PREALLOC	(1 << 5)
 #define LPM_KMALLOC		(1 << 6)
+#define HASH_LOOKUP		(1 << 7)
+#define ARRAY_LOOKUP		(1 << 8)
 
 static int test_flags = ~0;
 
@@ -125,6 +127,30 @@ static void test_lpm_kmalloc(int cpu)
 	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
 }
 
+static void test_hash_lookup(int cpu)
+{
+	__u64 start_time;
+	int i;
+
+	start_time = time_get_ns();
+	for (i = 0; i < MAX_CNT; i++)
+		syscall(__NR_getpgid, 0);
+	printf("%d:hash_lookup %lld lookups per sec\n",
+	       cpu, MAX_CNT * 1000000000ll * 64 / (time_get_ns() - start_time));
+}
+
+static void test_array_lookup(int cpu)
+{
+	__u64 start_time;
+	int i;
+
+	start_time = time_get_ns();
+	for (i = 0; i < MAX_CNT; i++)
+		syscall(__NR_getpgrp, 0);
+	printf("%d:array_lookup %lld lookups per sec\n",
+	       cpu, MAX_CNT * 1000000000ll * 64 / (time_get_ns() - start_time));
+}
+
 static void loop(int cpu)
 {
 	cpu_set_t cpuset;
@@ -153,6 +179,12 @@ static void loop(int cpu)
 
 	if (test_flags & LPM_KMALLOC)
 		test_lpm_kmalloc(cpu);
+
+	if (test_flags & HASH_LOOKUP)
+		test_hash_lookup(cpu);
+
+	if (test_flags & ARRAY_LOOKUP)
+		test_array_lookup(cpu);
 }
 
 static void run_perf_test(int tasks)