bpf: add lookup/update support for per-cpu hash and array maps
The functions bpf_map_lookup_elem(map, key, value) and
bpf_map_update_elem(map, key, value, flags) need to get/set
values from all-cpus for per-cpu hash and array maps,
so that user space can aggregate/update them as necessary.
Example of single counter aggregation in user space:
  unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
  long values[nr_cpus];
  long value = 0;
  bpf_lookup_elem(fd, key, values);
  for (i = 0; i < nr_cpus; i++)
    value += values[i];
User space must provide an array of round_up(value_size, 8) * nr_cpus
bytes to get/set values, since the kernel copies per-cpu values in
'long'-sized units to try to copy good counters atomically.
This is best-effort only, since bpf programs and user space are racing
to access the same memory.
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									a10423b87a
								
							
						
					
					
						commit
						15a07b3381
					
				| @ -183,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog); | ||||
| int bpf_obj_pin_user(u32 ufd, const char __user *pathname); | ||||
| int bpf_obj_get_user(const char __user *pathname); | ||||
| 
 | ||||
| int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); | ||||
| int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); | ||||
| int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, | ||||
| 			   u64 flags); | ||||
| int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, | ||||
| 			    u64 flags); | ||||
| 
 | ||||
/* memcpy that is used with 8-byte aligned pointers and a size that is a
 * multiple of 8 (callers pass round_up(value_size, 8)).  The data is moved
 * one 'long' at a time to try to atomically copy long counters.
 * Best-effort only.  No barriers here, since it _will_ race with concurrent
 * updates from BPF programs. Called from bpf syscall and mostly used with
 * size 8 or 16 bytes, so ask compiler to inline it.
 *
 * @dst:  destination buffer
 * @src:  source buffer
 * @size: number of bytes to copy; a trailing remainder smaller than
 *        sizeof(long) is not copied
 */
static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
{
	const volatile long *lsrc = src;
	volatile long *ldst = dst;

	size /= sizeof(long);
	/* volatile accesses keep the compiler from splitting an element into
	 * narrower accesses or from turning the loop into a memcpy() call,
	 * either of which would defeat the 'long'-sized copy
	 */
	while (size--)
		*ldst++ = *lsrc++;
}
| 
 | ||||
| /* verify correctness of eBPF program */ | ||||
| int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); | ||||
| #else | ||||
|  | ||||
| @ -130,6 +130,32 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) | ||||
| 	return this_cpu_ptr(array->pptrs[index]); | ||||
| } | ||||
| 
 | ||||
| int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) | ||||
| { | ||||
| 	struct bpf_array *array = container_of(map, struct bpf_array, map); | ||||
| 	u32 index = *(u32 *)key; | ||||
| 	void __percpu *pptr; | ||||
| 	int cpu, off = 0; | ||||
| 	u32 size; | ||||
| 
 | ||||
| 	if (unlikely(index >= array->map.max_entries)) | ||||
| 		return -ENOENT; | ||||
| 
 | ||||
| 	/* per_cpu areas are zero-filled and bpf programs can only
 | ||||
| 	 * access 'value_size' of them, so copying rounded areas | ||||
| 	 * will not leak any kernel data | ||||
| 	 */ | ||||
| 	size = round_up(map->value_size, 8); | ||||
| 	rcu_read_lock(); | ||||
| 	pptr = array->pptrs[index]; | ||||
| 	for_each_possible_cpu(cpu) { | ||||
| 		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); | ||||
| 		off += size; | ||||
| 	} | ||||
| 	rcu_read_unlock(); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /* Called from syscall */ | ||||
| static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | ||||
| { | ||||
| @ -177,6 +203,44 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, | ||||
| 			    u64 map_flags) | ||||
| { | ||||
| 	struct bpf_array *array = container_of(map, struct bpf_array, map); | ||||
| 	u32 index = *(u32 *)key; | ||||
| 	void __percpu *pptr; | ||||
| 	int cpu, off = 0; | ||||
| 	u32 size; | ||||
| 
 | ||||
| 	if (unlikely(map_flags > BPF_EXIST)) | ||||
| 		/* unknown flags */ | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	if (unlikely(index >= array->map.max_entries)) | ||||
| 		/* all elements were pre-allocated, cannot insert a new one */ | ||||
| 		return -E2BIG; | ||||
| 
 | ||||
| 	if (unlikely(map_flags == BPF_NOEXIST)) | ||||
| 		/* all elements already exist */ | ||||
| 		return -EEXIST; | ||||
| 
 | ||||
| 	/* the user space will provide round_up(value_size, 8) bytes that
 | ||||
| 	 * will be copied into per-cpu area. bpf programs can only access | ||||
| 	 * value_size of it. During lookup the same extra bytes will be | ||||
| 	 * returned or zeros which were zero-filled by percpu_alloc, | ||||
| 	 * so no kernel data leaks possible | ||||
| 	 */ | ||||
| 	size = round_up(map->value_size, 8); | ||||
| 	rcu_read_lock(); | ||||
| 	pptr = array->pptrs[index]; | ||||
| 	for_each_possible_cpu(cpu) { | ||||
| 		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); | ||||
| 		off += size; | ||||
| 	} | ||||
| 	rcu_read_unlock(); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /* Called from syscall or from eBPF program */ | ||||
| static int array_map_delete_elem(struct bpf_map *map, void *key) | ||||
| { | ||||
|  | ||||
| @ -290,7 +290,7 @@ static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) | ||||
| 
 | ||||
| static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | ||||
| 					 void *value, u32 key_size, u32 hash, | ||||
| 					 bool percpu) | ||||
| 					 bool percpu, bool onallcpus) | ||||
| { | ||||
| 	u32 size = htab->map.value_size; | ||||
| 	struct htab_elem *l_new; | ||||
| @ -312,8 +312,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | ||||
| 			return NULL; | ||||
| 		} | ||||
| 
 | ||||
| 		/* copy true value_size bytes */ | ||||
| 		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); | ||||
| 		if (!onallcpus) { | ||||
| 			/* copy true value_size bytes */ | ||||
| 			memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); | ||||
| 		} else { | ||||
| 			int off = 0, cpu; | ||||
| 
 | ||||
| 			for_each_possible_cpu(cpu) { | ||||
| 				bpf_long_memcpy(per_cpu_ptr(pptr, cpu), | ||||
| 						value + off, size); | ||||
| 				off += size; | ||||
| 			} | ||||
| 		} | ||||
| 		htab_elem_set_ptr(l_new, key_size, pptr); | ||||
| 	} else { | ||||
| 		memcpy(l_new->key + round_up(key_size, 8), value, size); | ||||
| @ -368,7 +378,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, | ||||
| 	/* allocate new element outside of the lock, since
 | ||||
| 	 * we're most likely going to insert it | ||||
| 	 */ | ||||
| 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false); | ||||
| 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); | ||||
| 	if (!l_new) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| @ -402,8 +412,9 @@ err: | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, | ||||
| 				       void *value, u64 map_flags) | ||||
| static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, | ||||
| 					 void *value, u64 map_flags, | ||||
| 					 bool onallcpus) | ||||
| { | ||||
| 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||||
| 	struct htab_elem *l_new = NULL, *l_old; | ||||
| @ -436,12 +447,25 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, | ||||
| 		goto err; | ||||
| 
 | ||||
| 	if (l_old) { | ||||
| 		void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); | ||||
| 		u32 size = htab->map.value_size; | ||||
| 
 | ||||
| 		/* per-cpu hash map can update value in-place */ | ||||
| 		memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)), | ||||
| 		       value, htab->map.value_size); | ||||
| 		if (!onallcpus) { | ||||
| 			memcpy(this_cpu_ptr(pptr), value, size); | ||||
| 		} else { | ||||
| 			int off = 0, cpu; | ||||
| 
 | ||||
| 			size = round_up(size, 8); | ||||
| 			for_each_possible_cpu(cpu) { | ||||
| 				bpf_long_memcpy(per_cpu_ptr(pptr, cpu), | ||||
| 						value + off, size); | ||||
| 				off += size; | ||||
| 			} | ||||
| 		} | ||||
| 	} else { | ||||
| 		l_new = alloc_htab_elem(htab, key, value, key_size, | ||||
| 					hash, true); | ||||
| 					hash, true, onallcpus); | ||||
| 		if (!l_new) { | ||||
| 			ret = -ENOMEM; | ||||
| 			goto err; | ||||
| @ -455,6 +479,12 @@ err: | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, | ||||
| 				       void *value, u64 map_flags) | ||||
| { | ||||
| 	return __htab_percpu_map_update_elem(map, key, value, map_flags, false); | ||||
| } | ||||
| 
 | ||||
| /* Called from syscall or from eBPF program */ | ||||
| static int htab_map_delete_elem(struct bpf_map *map, void *key) | ||||
| { | ||||
| @ -557,6 +587,41 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) | ||||
| 		return NULL; | ||||
| } | ||||
| 
 | ||||
| int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) | ||||
| { | ||||
| 	struct htab_elem *l; | ||||
| 	void __percpu *pptr; | ||||
| 	int ret = -ENOENT; | ||||
| 	int cpu, off = 0; | ||||
| 	u32 size; | ||||
| 
 | ||||
| 	/* per_cpu areas are zero-filled and bpf programs can only
 | ||||
| 	 * access 'value_size' of them, so copying rounded areas | ||||
| 	 * will not leak any kernel data | ||||
| 	 */ | ||||
| 	size = round_up(map->value_size, 8); | ||||
| 	rcu_read_lock(); | ||||
| 	l = __htab_map_lookup_elem(map, key); | ||||
| 	if (!l) | ||||
| 		goto out; | ||||
| 	pptr = htab_elem_get_ptr(l, map->key_size); | ||||
| 	for_each_possible_cpu(cpu) { | ||||
| 		bpf_long_memcpy(value + off, | ||||
| 				per_cpu_ptr(pptr, cpu), size); | ||||
| 		off += size; | ||||
| 	} | ||||
| 	ret = 0; | ||||
| out: | ||||
| 	rcu_read_unlock(); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, | ||||
| 			   u64 map_flags) | ||||
| { | ||||
| 	return __htab_percpu_map_update_elem(map, key, value, map_flags, true); | ||||
| } | ||||
| 
 | ||||
| static const struct bpf_map_ops htab_percpu_ops = { | ||||
| 	.map_alloc = htab_map_alloc, | ||||
| 	.map_free = htab_map_free, | ||||
|  | ||||
| @ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr) | ||||
| 	int ufd = attr->map_fd; | ||||
| 	struct bpf_map *map; | ||||
| 	void *key, *value, *ptr; | ||||
| 	u32 value_size; | ||||
| 	struct fd f; | ||||
| 	int err; | ||||
| 
 | ||||
| @ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr) | ||||
| 	if (copy_from_user(key, ukey, map->key_size) != 0) | ||||
| 		goto free_key; | ||||
| 
 | ||||
| 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | ||||
| 	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) | ||||
| 		value_size = round_up(map->value_size, 8) * num_possible_cpus(); | ||||
| 	else | ||||
| 		value_size = map->value_size; | ||||
| 
 | ||||
| 	err = -ENOMEM; | ||||
| 	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); | ||||
| 	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); | ||||
| 	if (!value) | ||||
| 		goto free_key; | ||||
| 
 | ||||
| 	rcu_read_lock(); | ||||
| 	ptr = map->ops->map_lookup_elem(map, key); | ||||
| 	if (ptr) | ||||
| 		memcpy(value, ptr, map->value_size); | ||||
| 	rcu_read_unlock(); | ||||
| 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { | ||||
| 		err = bpf_percpu_hash_copy(map, key, value); | ||||
| 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { | ||||
| 		err = bpf_percpu_array_copy(map, key, value); | ||||
| 	} else { | ||||
| 		rcu_read_lock(); | ||||
| 		ptr = map->ops->map_lookup_elem(map, key); | ||||
| 		if (ptr) | ||||
| 			memcpy(value, ptr, value_size); | ||||
| 		rcu_read_unlock(); | ||||
| 		err = ptr ? 0 : -ENOENT; | ||||
| 	} | ||||
| 
 | ||||
| 	err = -ENOENT; | ||||
| 	if (!ptr) | ||||
| 	if (err) | ||||
| 		goto free_value; | ||||
| 
 | ||||
| 	err = -EFAULT; | ||||
| 	if (copy_to_user(uvalue, value, map->value_size) != 0) | ||||
| 	if (copy_to_user(uvalue, value, value_size) != 0) | ||||
| 		goto free_value; | ||||
| 
 | ||||
| 	err = 0; | ||||
| @ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr) | ||||
| 	int ufd = attr->map_fd; | ||||
| 	struct bpf_map *map; | ||||
| 	void *key, *value; | ||||
| 	u32 value_size; | ||||
| 	struct fd f; | ||||
| 	int err; | ||||
| 
 | ||||
| @ -318,21 +332,30 @@ static int map_update_elem(union bpf_attr *attr) | ||||
| 	if (copy_from_user(key, ukey, map->key_size) != 0) | ||||
| 		goto free_key; | ||||
| 
 | ||||
| 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | ||||
| 	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) | ||||
| 		value_size = round_up(map->value_size, 8) * num_possible_cpus(); | ||||
| 	else | ||||
| 		value_size = map->value_size; | ||||
| 
 | ||||
| 	err = -ENOMEM; | ||||
| 	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); | ||||
| 	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); | ||||
| 	if (!value) | ||||
| 		goto free_key; | ||||
| 
 | ||||
| 	err = -EFAULT; | ||||
| 	if (copy_from_user(value, uvalue, map->value_size) != 0) | ||||
| 	if (copy_from_user(value, uvalue, value_size) != 0) | ||||
| 		goto free_value; | ||||
| 
 | ||||
| 	/* eBPF programs that use maps are running under rcu_read_lock(),
 | ||||
| 	 * therefore all map accessors rely on this fact, so do the same here | ||||
| 	 */ | ||||
| 	rcu_read_lock(); | ||||
| 	err = map->ops->map_update_elem(map, key, value, attr->flags); | ||||
| 	rcu_read_unlock(); | ||||
| 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { | ||||
| 		err = bpf_percpu_hash_update(map, key, value, attr->flags); | ||||
| 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { | ||||
| 		err = bpf_percpu_array_update(map, key, value, attr->flags); | ||||
| 	} else { | ||||
| 		rcu_read_lock(); | ||||
| 		err = map->ops->map_update_elem(map, key, value, attr->flags); | ||||
| 		rcu_read_unlock(); | ||||
| 	} | ||||
| 
 | ||||
| free_value: | ||||
| 	kfree(value); | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user