mirror of
https://github.com/torvalds/linux.git
synced 2024-11-23 12:42:02 +00:00
d83525ca62
Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let bpf program serialize access to other variables. Example: struct hash_elem { int cnt; struct bpf_spin_lock lock; }; struct hash_elem * val = bpf_map_lookup_elem(&hash_map, &key); if (val) { bpf_spin_lock(&val->lock); val->cnt++; bpf_spin_unlock(&val->lock); } Restrictions and safety checks: - bpf_spin_lock is only allowed inside HASH and ARRAY maps. - BTF description of the map is mandatory for safety analysis. - bpf program can take one bpf_spin_lock at a time, since two or more can cause dead locks. - only one 'struct bpf_spin_lock' is allowed per map element. It drastically simplifies implementation yet allows bpf program to use any number of bpf_spin_locks. - when bpf_spin_lock is taken the calls (either bpf2bpf or helpers) are not allowed. - bpf program must bpf_spin_unlock() before return. - bpf program can access 'struct bpf_spin_lock' only via bpf_spin_lock()/bpf_spin_unlock() helpers. - load/store into 'struct bpf_spin_lock lock;' field is not allowed. - to use bpf_spin_lock() helper the BTF description of map value must be a struct and have 'struct bpf_spin_lock anyname;' field at the top level. Nested lock inside another struct is not allowed. - syscall map_lookup doesn't copy bpf_spin_lock field to user space. - syscall map_update and program map_update do not update bpf_spin_lock field. - bpf_spin_lock cannot be on the stack or inside networking packet. bpf_spin_lock can only be inside HASH or ARRAY map value. - bpf_spin_lock is available to root only and to all program types. - bpf_spin_lock is not allowed in inner maps of map-in-map. - ld_abs is not allowed inside spin_lock-ed region. - tracing progs and socket filter progs cannot use bpf_spin_lock due to insufficient preemption checks Implementation details: - cgroup-bpf class of programs can nest with xdp/tc programs. Hence bpf_spin_lock is equivalent to spin_lock_irqsave. 
Other solutions to avoid nested bpf_spin_lock are possible, like making sure that all networking progs run with softirq disabled. spin_lock_irqsave is the simplest and doesn't add overhead to the programs that don't use it. - arch_spinlock_t is used when it's implemented as queued_spin_lock - archs can force their own arch_spinlock_t - on architectures where queued_spin_lock is not available and sizeof(arch_spinlock_t) != sizeof(__u32) trivial lock is used. - presence of bpf_spin_lock inside map value could have been indicated via extra flag during map_create, but specifying it via BTF is cleaner. It provides introspection for map key/value and reduces user mistakes. Next steps: - allow bpf_spin_lock in other map types (like cgroup local storage) - introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper to request kernel to grab bpf_spin_lock before rewriting the value. That will serialize access to map elements. Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
123 lines
3.1 KiB
C
123 lines
3.1 KiB
C
/* Copyright (c) 2017 Facebook
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of version 2 of the GNU General Public
|
|
* License as published by the Free Software Foundation.
|
|
*/
|
|
#include <linux/slab.h>
|
|
#include <linux/bpf.h>
|
|
|
|
#include "map_in_map.h"
|
|
|
|
struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
|
|
{
|
|
struct bpf_map *inner_map, *inner_map_meta;
|
|
u32 inner_map_meta_size;
|
|
struct fd f;
|
|
|
|
f = fdget(inner_map_ufd);
|
|
inner_map = __bpf_map_get(f);
|
|
if (IS_ERR(inner_map))
|
|
return inner_map;
|
|
|
|
/* prog_array->owner_prog_type and owner_jited
|
|
* is a runtime binding. Doing static check alone
|
|
* in the verifier is not enough.
|
|
*/
|
|
if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
|
|
inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
|
|
inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
|
|
fdput(f);
|
|
return ERR_PTR(-ENOTSUPP);
|
|
}
|
|
|
|
/* Does not support >1 level map-in-map */
|
|
if (inner_map->inner_map_meta) {
|
|
fdput(f);
|
|
return ERR_PTR(-EINVAL);
|
|
}
|
|
|
|
if (map_value_has_spin_lock(inner_map)) {
|
|
fdput(f);
|
|
return ERR_PTR(-ENOTSUPP);
|
|
}
|
|
|
|
inner_map_meta_size = sizeof(*inner_map_meta);
|
|
/* In some cases verifier needs to access beyond just base map. */
|
|
if (inner_map->ops == &array_map_ops)
|
|
inner_map_meta_size = sizeof(struct bpf_array);
|
|
|
|
inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
|
|
if (!inner_map_meta) {
|
|
fdput(f);
|
|
return ERR_PTR(-ENOMEM);
|
|
}
|
|
|
|
inner_map_meta->map_type = inner_map->map_type;
|
|
inner_map_meta->key_size = inner_map->key_size;
|
|
inner_map_meta->value_size = inner_map->value_size;
|
|
inner_map_meta->map_flags = inner_map->map_flags;
|
|
inner_map_meta->max_entries = inner_map->max_entries;
|
|
|
|
/* Misc members not needed in bpf_map_meta_equal() check. */
|
|
inner_map_meta->ops = inner_map->ops;
|
|
if (inner_map->ops == &array_map_ops) {
|
|
inner_map_meta->unpriv_array = inner_map->unpriv_array;
|
|
container_of(inner_map_meta, struct bpf_array, map)->index_mask =
|
|
container_of(inner_map, struct bpf_array, map)->index_mask;
|
|
}
|
|
|
|
fdput(f);
|
|
return inner_map_meta;
|
|
}
|
|
|
|
void bpf_map_meta_free(struct bpf_map *map_meta)
|
|
{
|
|
kfree(map_meta);
|
|
}
|
|
|
|
bool bpf_map_meta_equal(const struct bpf_map *meta0,
|
|
const struct bpf_map *meta1)
|
|
{
|
|
/* No need to compare ops because it is covered by map_type */
|
|
return meta0->map_type == meta1->map_type &&
|
|
meta0->key_size == meta1->key_size &&
|
|
meta0->value_size == meta1->value_size &&
|
|
meta0->map_flags == meta1->map_flags &&
|
|
meta0->max_entries == meta1->max_entries;
|
|
}
|
|
|
|
void *bpf_map_fd_get_ptr(struct bpf_map *map,
|
|
struct file *map_file /* not used */,
|
|
int ufd)
|
|
{
|
|
struct bpf_map *inner_map;
|
|
struct fd f;
|
|
|
|
f = fdget(ufd);
|
|
inner_map = __bpf_map_get(f);
|
|
if (IS_ERR(inner_map))
|
|
return inner_map;
|
|
|
|
if (bpf_map_meta_equal(map->inner_map_meta, inner_map))
|
|
inner_map = bpf_map_inc(inner_map, false);
|
|
else
|
|
inner_map = ERR_PTR(-EINVAL);
|
|
|
|
fdput(f);
|
|
return inner_map;
|
|
}
|
|
|
|
void bpf_map_fd_put_ptr(void *ptr)
|
|
{
|
|
/* ptr->ops->map_free() has to go through one
|
|
* rcu grace period by itself.
|
|
*/
|
|
bpf_map_put(ptr);
|
|
}
|
|
|
|
u32 bpf_map_fd_sys_lookup_elem(void *ptr)
|
|
{
|
|
return ((struct bpf_map *)ptr)->id;
|
|
}
|