Merge branch 'linus' into perf/urgent, to synchronize with upstream
Signed-off-by: Ingo Molnar <mingo@kernel.org>

kernel/Kconfig.locks

@@ -101,7 +101,7 @@ config UNINLINE_SPIN_UNLOCK
 # unlock and unlock_irq functions are inlined when:
 #   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
 #  or
-#   - DEBUG_SPINLOCK=n and PREEMPT=n
+#   - DEBUG_SPINLOCK=n and PREEMPTION=n
 #
 # unlock_bh and unlock_irqrestore functions are inlined when:
 #   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
@@ -139,7 +139,7 @@ config INLINE_SPIN_UNLOCK_BH

 config INLINE_SPIN_UNLOCK_IRQ
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
+	depends on !PREEMPTION || ARCH_INLINE_SPIN_UNLOCK_IRQ

 config INLINE_SPIN_UNLOCK_IRQRESTORE
 	def_bool y
@@ -168,7 +168,7 @@ config INLINE_READ_LOCK_IRQSAVE

 config INLINE_READ_UNLOCK
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
+	depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK

 config INLINE_READ_UNLOCK_BH
 	def_bool y
@@ -176,7 +176,7 @@ config INLINE_READ_UNLOCK_BH

 config INLINE_READ_UNLOCK_IRQ
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
+	depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK_IRQ

 config INLINE_READ_UNLOCK_IRQRESTORE
 	def_bool y
@@ -205,7 +205,7 @@ config INLINE_WRITE_LOCK_IRQSAVE

 config INLINE_WRITE_UNLOCK
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
+	depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK

 config INLINE_WRITE_UNLOCK_BH
 	def_bool y
@@ -213,7 +213,7 @@ config INLINE_WRITE_UNLOCK_BH

 config INLINE_WRITE_UNLOCK_IRQ
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
+	depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK_IRQ

 config INLINE_WRITE_UNLOCK_IRQRESTORE
 	def_bool y
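
These Kconfig symbols decide whether each unlock variant is emitted inline or as an out-of-line call, and the rename simply tracks the kernel-wide PREEMPT -> PREEMPTION split. A minimal sketch of the pattern that consumes them, modeled on include/linux/spinlock_api_smp.h (simplified; lockdep annotations and the UP case are omitted):

	/* When CONFIG_INLINE_SPIN_UNLOCK_IRQ=y the API maps straight to the
	 * always-inline helper instead of an exported out-of-line function
	 * in kernel/locking/spinlock.c.
	 */
	#ifdef CONFIG_INLINE_SPIN_UNLOCK_IRQ
	#define _raw_spin_unlock_irq(lock) __raw_spin_unlock_irq(lock)
	#endif

	static inline void __raw_spin_unlock_irq(raw_spinlock_t *lock)
	{
		do_raw_spin_unlock(lock);	/* drop the hardware lock */
		local_irq_enable();		/* re-enable local interrupts */
		preempt_enable();		/* may trigger a reschedule */
	}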

kernel/bpf/Makefile

@@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
@@ -26,3 +27,6 @@ endif
 ifeq ($(CONFIG_SYSFS),y)
 obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
 endif
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+endif

kernel/bpf/arraymap.c

@@ -503,6 +503,8 @@ const struct bpf_map_ops array_map_ops = {
 	.map_mmap = array_map_mmap,
 	.map_seq_show_elem = array_map_seq_show_elem,
 	.map_check_btf = array_map_check_btf,
+	.map_lookup_batch = generic_map_lookup_batch,
+	.map_update_batch = generic_map_update_batch,
 };

 const struct bpf_map_ops percpu_array_map_ops = {
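
The two generic callbacks hook the array map into the new batched syscall commands, so one bpf(2) call can move many elements. A rough userspace sketch of driving BPF_MAP_LOOKUP_BATCH with a raw syscall (illustrative only; error handling trimmed):

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int lookup_batch(int map_fd, void *in_batch, void *out_batch,
				void *keys, void *values, __u32 *count)
	{
		union bpf_attr attr;
		int err;

		memset(&attr, 0, sizeof(attr));
		attr.batch.map_fd = map_fd;
		attr.batch.in_batch = (__u64)(unsigned long)in_batch;	/* NULL on first call */
		attr.batch.out_batch = (__u64)(unsigned long)out_batch;	/* resume cursor */
		attr.batch.keys = (__u64)(unsigned long)keys;
		attr.batch.values = (__u64)(unsigned long)values;
		attr.batch.count = *count;

		err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));
		*count = attr.batch.count;	/* elements actually copied out */
		return err;
	}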

kernel/bpf/bpf_struct_ops.c (new file, 634 lines)

@@ -0,0 +1,634 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/numa.h>
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>

enum bpf_struct_ops_state {
	BPF_STRUCT_OPS_STATE_INIT,
	BPF_STRUCT_OPS_STATE_INUSE,
	BPF_STRUCT_OPS_STATE_TOBEFREE,
};

#define BPF_STRUCT_OPS_COMMON_VALUE			\
	refcount_t refcnt;				\
	enum bpf_struct_ops_state state

struct bpf_struct_ops_value {
	BPF_STRUCT_OPS_COMMON_VALUE;
	char data[0] ____cacheline_aligned_in_smp;
};

struct bpf_struct_ops_map {
	struct bpf_map map;
	const struct bpf_struct_ops *st_ops;
	/* protect map_update */
	struct mutex lock;
	/* progs has all the bpf_prog that is populated
	 * to the func ptr of the kernel's struct
	 * (in kvalue.data).
	 */
	struct bpf_prog **progs;
	/* image is a page that has all the trampolines
	 * that stores the func args before calling the bpf_prog.
	 * A PAGE_SIZE "image" is enough to store all trampoline for
	 * "progs[]".
	 */
	void *image;
	/* uvalue->data stores the kernel struct
	 * (e.g. tcp_congestion_ops) that is more useful
	 * to userspace than the kvalue.  For example,
	 * the bpf_prog's id is stored instead of the kernel
	 * address of a func ptr.
	 */
	struct bpf_struct_ops_value *uvalue;
	/* kvalue.data stores the actual kernel's struct
	 * (e.g. tcp_congestion_ops) that will be
	 * registered to the kernel subsystem.
	 */
	struct bpf_struct_ops_value kvalue;
};

#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)

/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
 * the map's value exposed to the userspace and its btf-type-id is
 * stored at the map->btf_vmlinux_value_type_id.
 *
 */
#define BPF_STRUCT_OPS_TYPE(_name)				\
extern struct bpf_struct_ops bpf_##_name;			\
								\
struct bpf_struct_ops_##_name {					\
	BPF_STRUCT_OPS_COMMON_VALUE;				\
	struct _name data ____cacheline_aligned_in_smp;		\
};
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE

enum {
#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
	__NR_BPF_STRUCT_OPS_TYPE,
};

static struct bpf_struct_ops * const bpf_struct_ops[] = {
#define BPF_STRUCT_OPS_TYPE(_name)				\
	[BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
};

const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};

const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
};

static const struct btf_type *module_type;

void bpf_struct_ops_init(struct btf *btf)
{
	s32 type_id, value_id, module_id;
	const struct btf_member *member;
	struct bpf_struct_ops *st_ops;
	struct bpf_verifier_log log = {};
	const struct btf_type *t;
	char value_name[128];
	const char *mname;
	u32 i, j;

	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE

	module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
	if (module_id < 0) {
		pr_warn("Cannot find struct module in btf_vmlinux\n");
		return;
	}
	module_type = btf_type_by_id(btf, module_id);

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		st_ops = bpf_struct_ops[i];

		if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
		    sizeof(value_name)) {
			pr_warn("struct_ops name %s is too long\n",
				st_ops->name);
			continue;
		}
		sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);

		value_id = btf_find_by_name_kind(btf, value_name,
						 BTF_KIND_STRUCT);
		if (value_id < 0) {
			pr_warn("Cannot find struct %s in btf_vmlinux\n",
				value_name);
			continue;
		}

		type_id = btf_find_by_name_kind(btf, st_ops->name,
						BTF_KIND_STRUCT);
		if (type_id < 0) {
			pr_warn("Cannot find struct %s in btf_vmlinux\n",
				st_ops->name);
			continue;
		}
		t = btf_type_by_id(btf, type_id);
		if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
			pr_warn("Cannot support #%u members in struct %s\n",
				btf_type_vlen(t), st_ops->name);
			continue;
		}

		for_each_member(j, t, member) {
			const struct btf_type *func_proto;

			mname = btf_name_by_offset(btf, member->name_off);
			if (!*mname) {
				pr_warn("anon member in struct %s is not supported\n",
					st_ops->name);
				break;
			}

			if (btf_member_bitfield_size(t, member)) {
				pr_warn("bit field member %s in struct %s is not supported\n",
					mname, st_ops->name);
				break;
			}

			func_proto = btf_type_resolve_func_ptr(btf,
							       member->type,
							       NULL);
			if (func_proto &&
			    btf_distill_func_proto(&log, btf,
						   func_proto, mname,
						   &st_ops->func_models[j])) {
				pr_warn("Error in parsing func ptr %s in struct %s\n",
					mname, st_ops->name);
				break;
			}
		}

		if (j == btf_type_vlen(t)) {
			if (st_ops->init(btf)) {
				pr_warn("Error in init bpf_struct_ops %s\n",
					st_ops->name);
			} else {
				st_ops->type_id = type_id;
				st_ops->type = t;
				st_ops->value_id = value_id;
				st_ops->value_type = btf_type_by_id(btf,
								    value_id);
			}
		}
	}
}

extern struct btf *btf_vmlinux;

static const struct bpf_struct_ops *
bpf_struct_ops_find_value(u32 value_id)
{
	unsigned int i;

	if (!value_id || !btf_vmlinux)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		if (bpf_struct_ops[i]->value_id == value_id)
			return bpf_struct_ops[i];
	}

	return NULL;
}

const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
{
	unsigned int i;

	if (!type_id || !btf_vmlinux)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		if (bpf_struct_ops[i]->type_id == type_id)
			return bpf_struct_ops[i];
	}

	return NULL;
}

static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
					   void *next_key)
{
	if (key && *(u32 *)key == 0)
		return -ENOENT;

	*(u32 *)next_key = 0;
	return 0;
}

int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
				       void *value)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	enum bpf_struct_ops_state state;

	if (unlikely(*(u32 *)key != 0))
		return -ENOENT;

	kvalue = &st_map->kvalue;
	/* Pair with smp_store_release() during map_update */
	state = smp_load_acquire(&kvalue->state);
	if (state == BPF_STRUCT_OPS_STATE_INIT) {
		memset(value, 0, map->value_size);
		return 0;
	}

	/* No lock is needed.  state and refcnt do not need
	 * to be updated together under atomic context.
	 */
	uvalue = (struct bpf_struct_ops_value *)value;
	memcpy(uvalue, st_map->uvalue, map->value_size);
	uvalue->state = state;
	refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));

	return 0;
}

static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EINVAL);
}

static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
	const struct btf_type *t = st_map->st_ops->type;
	u32 i;

	for (i = 0; i < btf_type_vlen(t); i++) {
		if (st_map->progs[i]) {
			bpf_prog_put(st_map->progs[i]);
			st_map->progs[i] = NULL;
		}
	}
}

static int check_zero_holes(const struct btf_type *t, void *data)
{
	const struct btf_member *member;
	u32 i, moff, msize, prev_mend = 0;
	const struct btf_type *mtype;

	for_each_member(i, t, member) {
		moff = btf_member_bit_offset(t, member) / 8;
		if (moff > prev_mend &&
		    memchr_inv(data + prev_mend, 0, moff - prev_mend))
			return -EINVAL;

		mtype = btf_type_by_id(btf_vmlinux, member->type);
		mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
					 NULL, NULL);
		if (IS_ERR(mtype))
			return PTR_ERR(mtype);
		prev_mend = moff + msize;
	}

	if (t->size > prev_mend &&
	    memchr_inv(data + prev_mend, 0, t->size - prev_mend))
		return -EINVAL;

	return 0;
}

static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
					  void *value, u64 flags)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops *st_ops = st_map->st_ops;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	const struct btf_member *member;
	const struct btf_type *t = st_ops->type;
	void *udata, *kdata;
	int prog_fd, err = 0;
	void *image;
	u32 i;

	if (flags)
		return -EINVAL;

	if (*(u32 *)key != 0)
		return -E2BIG;

	err = check_zero_holes(st_ops->value_type, value);
	if (err)
		return err;

	uvalue = (struct bpf_struct_ops_value *)value;
	err = check_zero_holes(t, uvalue->data);
	if (err)
		return err;

	if (uvalue->state || refcount_read(&uvalue->refcnt))
		return -EINVAL;

	uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
	kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;

	mutex_lock(&st_map->lock);

	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
		err = -EBUSY;
		goto unlock;
	}

	memcpy(uvalue, value, map->value_size);

	udata = &uvalue->data;
	kdata = &kvalue->data;
	image = st_map->image;

	for_each_member(i, t, member) {
		const struct btf_type *mtype, *ptype;
		struct bpf_prog *prog;
		u32 moff;

		moff = btf_member_bit_offset(t, member) / 8;
		ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
		if (ptype == module_type) {
			if (*(void **)(udata + moff))
				goto reset_unlock;
			*(void **)(kdata + moff) = BPF_MODULE_OWNER;
			continue;
		}

		err = st_ops->init_member(t, member, kdata, udata);
		if (err < 0)
			goto reset_unlock;

		/* The ->init_member() has handled this member */
		if (err > 0)
			continue;

		/* If st_ops->init_member does not handle it,
		 * we will only handle func ptrs and zero-ed members
		 * here.  Reject everything else.
		 */

		/* All non func ptr member must be 0 */
		if (!ptype || !btf_type_is_func_proto(ptype)) {
			u32 msize;

			mtype = btf_type_by_id(btf_vmlinux, member->type);
			mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
						 NULL, NULL);
			if (IS_ERR(mtype)) {
				err = PTR_ERR(mtype);
				goto reset_unlock;
			}

			if (memchr_inv(udata + moff, 0, msize)) {
				err = -EINVAL;
				goto reset_unlock;
			}

			continue;
		}

		prog_fd = (int)(*(unsigned long *)(udata + moff));
		/* Similar check as the attr->attach_prog_fd */
		if (!prog_fd)
			continue;

		prog = bpf_prog_get(prog_fd);
		if (IS_ERR(prog)) {
			err = PTR_ERR(prog);
			goto reset_unlock;
		}
		st_map->progs[i] = prog;

		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
		    prog->aux->attach_btf_id != st_ops->type_id ||
		    prog->expected_attach_type != i) {
			err = -EINVAL;
			goto reset_unlock;
		}

		err = arch_prepare_bpf_trampoline(image,
						  st_map->image + PAGE_SIZE,
						  &st_ops->func_models[i], 0,
						  &prog, 1, NULL, 0, NULL);
		if (err < 0)
			goto reset_unlock;

		*(void **)(kdata + moff) = image;
		image += err;

		/* put prog_id to udata */
		*(unsigned long *)(udata + moff) = prog->aux->id;
	}

	refcount_set(&kvalue->refcnt, 1);
	bpf_map_inc(map);

	set_memory_ro((long)st_map->image, 1);
	set_memory_x((long)st_map->image, 1);
	err = st_ops->reg(kdata);
	if (likely(!err)) {
		/* Pair with smp_load_acquire() during lookup_elem().
		 * It ensures the above udata updates (e.g. prog->aux->id)
		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
		 */
		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
		goto unlock;
	}

	/* Error during st_ops->reg().  It is very unlikely since
	 * the above init_member() should have caught it earlier
	 * before reg().  The only possibility is if there was a race
	 * in registering the struct_ops (under the same name) to
	 * a sub-system through different struct_ops's maps.
	 */
	set_memory_nx((long)st_map->image, 1);
	set_memory_rw((long)st_map->image, 1);
	bpf_map_put(map);

reset_unlock:
	bpf_struct_ops_map_put_progs(st_map);
	memset(uvalue, 0, map->value_size);
	memset(kvalue, 0, map->value_size);
unlock:
	mutex_unlock(&st_map->lock);
	return err;
}

static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
	enum bpf_struct_ops_state prev_state;
	struct bpf_struct_ops_map *st_map;

	st_map = (struct bpf_struct_ops_map *)map;
	prev_state = cmpxchg(&st_map->kvalue.state,
			     BPF_STRUCT_OPS_STATE_INUSE,
			     BPF_STRUCT_OPS_STATE_TOBEFREE);
	if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) {
		st_map->st_ops->unreg(&st_map->kvalue.data);
		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
			bpf_map_put(map);
	}

	return 0;
}

static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
					     struct seq_file *m)
{
	void *value;
	int err;

	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		return;

	err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
	if (!err) {
		btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
				  value, m);
		seq_puts(m, "\n");
	}

	kfree(value);
}

static void bpf_struct_ops_map_free(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	if (st_map->progs)
		bpf_struct_ops_map_put_progs(st_map);
	bpf_map_area_free(st_map->progs);
	bpf_jit_free_exec(st_map->image);
	bpf_map_area_free(st_map->uvalue);
	bpf_map_area_free(st_map);
}

static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
		return -EINVAL;
	return 0;
}

static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
	const struct bpf_struct_ops *st_ops;
	size_t map_total_size, st_map_size;
	struct bpf_struct_ops_map *st_map;
	const struct btf_type *t, *vt;
	struct bpf_map_memory mem;
	struct bpf_map *map;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
	if (!st_ops)
		return ERR_PTR(-ENOTSUPP);

	vt = st_ops->value_type;
	if (attr->value_size != vt->size)
		return ERR_PTR(-EINVAL);

	t = st_ops->type;

	st_map_size = sizeof(*st_map) +
		/* kvalue stores the
		 * struct bpf_struct_ops_tcp_congestions_ops
		 */
		(vt->size - sizeof(struct bpf_struct_ops_value));
	map_total_size = st_map_size +
		/* uvalue */
		sizeof(vt->size) +
		/* struct bpf_progs **progs */
		btf_type_vlen(t) * sizeof(struct bpf_prog *);
	err = bpf_map_charge_init(&mem, map_total_size);
	if (err < 0)
		return ERR_PTR(err);

	st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
	if (!st_map) {
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}
	st_map->st_ops = st_ops;
	map = &st_map->map;

	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
	st_map->progs =
		bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
				   NUMA_NO_NODE);
	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
	if (!st_map->uvalue || !st_map->progs || !st_map->image) {
		bpf_struct_ops_map_free(map);
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}

	mutex_init(&st_map->lock);
	set_vm_flush_reset_perms(st_map->image);
	bpf_map_init_from_attr(map, attr);
	bpf_map_charge_move(&map->memory, &mem);

	return map;
}

const struct bpf_map_ops bpf_struct_ops_map_ops = {
	.map_alloc_check = bpf_struct_ops_map_alloc_check,
	.map_alloc = bpf_struct_ops_map_alloc,
	.map_free = bpf_struct_ops_map_free,
	.map_get_next_key = bpf_struct_ops_map_get_next_key,
	.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
	.map_delete_elem = bpf_struct_ops_map_delete_elem,
	.map_update_elem = bpf_struct_ops_map_update_elem,
	.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
};

/* "const void *" because some subsystem is
 * passing a const (e.g. const struct tcp_congestion_ops *)
 */
bool bpf_struct_ops_get(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);

	return refcount_inc_not_zero(&kvalue->refcnt);
}

void bpf_struct_ops_put(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	if (refcount_dec_and_test(&kvalue->refcnt)) {
		struct bpf_struct_ops_map *st_map;

		st_map = container_of(kvalue, struct bpf_struct_ops_map,
				      kvalue);
		bpf_map_put(&st_map->map);
	}
}
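
For orientation, the BPF_STRUCT_OPS_TYPE machinery in the file above stamps out one map-value struct per registered type. Expanding the macro by hand for the single type wired up in this series gives (this struct is generated by the preprocessor, not extra code in the tree):

	extern struct bpf_struct_ops bpf_tcp_congestion_ops;

	/* BPF_STRUCT_OPS_TYPE(tcp_congestion_ops), expanded */
	struct bpf_struct_ops_tcp_congestion_ops {
		refcount_t refcnt;			/* BPF_STRUCT_OPS_COMMON_VALUE */
		enum bpf_struct_ops_state state;	/* BPF_STRUCT_OPS_COMMON_VALUE */
		struct tcp_congestion_ops data ____cacheline_aligned_in_smp;
	};

This "bpf_struct_ops_tcp_congestion_ops" struct is what bpf_struct_ops_init() looks up in btf_vmlinux via the VALUE_PREFIX name, and its size is what bpf_struct_ops_map_alloc() checks against attr->value_size.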

kernel/bpf/bpf_struct_ops_types.h (new file, 9 lines)

@@ -0,0 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* internal file - do not include directly */

#ifdef CONFIG_BPF_JIT
#ifdef CONFIG_INET
#include <net/tcp.h>
BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
#endif
#endif
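
Adding a further struct_ops type is meant to be a one-line change in this header plus a struct bpf_struct_ops definition in the owning subsystem. A hypothetical sketch (subsystem, config symbol, and struct name invented for illustration):

	/* hypothetical: expose "struct my_ops" to BPF, guarded like tcp above */
	#ifdef CONFIG_MY_SUBSYS
	#include <linux/my_subsys.h>
	BPF_STRUCT_OPS_TYPE(my_ops)
	#endif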

kernel/bpf/btf.c (504 lines changed)
@@ -180,11 +180,6 @@
  */
 #define BTF_MAX_SIZE (16 * 1024 * 1024)

-#define for_each_member(i, struct_type, member)			\
-	for (i = 0, member = btf_type_member(struct_type);	\
-	     i < btf_type_vlen(struct_type);			\
-	     i++, member++)
-
 #define for_each_member_from(i, from, struct_type, member)		\
 	for (i = from, member = btf_type_member(struct_type) + from;	\
 	     i < btf_type_vlen(struct_type);				\
@@ -281,6 +276,11 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_DATASEC]	= "DATASEC",
 };

+static const char *btf_type_str(const struct btf_type *t)
+{
+	return btf_kind_str[BTF_INFO_KIND(t->info)];
+}
+
 struct btf_kind_operations {
 	s32 (*check_meta)(struct btf_verifier_env *env,
 			  const struct btf_type *t,
@@ -382,6 +382,65 @@ static bool btf_type_is_datasec(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
 }

+s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
+{
+	const struct btf_type *t;
+	const char *tname;
+	u32 i;
+
+	for (i = 1; i <= btf->nr_types; i++) {
+		t = btf->types[i];
+		if (BTF_INFO_KIND(t->info) != kind)
+			continue;
+
+		tname = btf_name_by_offset(btf, t->name_off);
+		if (!strcmp(tname, name))
+			return i;
+	}
+
+	return -ENOENT;
+}
+
+const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
+					       u32 id, u32 *res_id)
+{
+	const struct btf_type *t = btf_type_by_id(btf, id);
+
+	while (btf_type_is_modifier(t)) {
+		id = t->type;
+		t = btf_type_by_id(btf, t->type);
+	}
+
+	if (res_id)
+		*res_id = id;
+
+	return t;
+}
+
+const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
+					    u32 id, u32 *res_id)
+{
+	const struct btf_type *t;
+
+	t = btf_type_skip_modifiers(btf, id, NULL);
+	if (!btf_type_is_ptr(t))
+		return NULL;
+
+	return btf_type_skip_modifiers(btf, t->type, res_id);
+}
+
+const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
+						 u32 id, u32 *res_id)
+{
+	const struct btf_type *ptype;
+
+	ptype = btf_type_resolve_ptr(btf, id, res_id);
+	if (ptype && btf_type_is_func_proto(ptype))
+		return ptype;
+
+	return NULL;
+}
+
 /* Types that act only as a source, not sink or intermediate
  * type when resolving.
  */
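
These helpers compose: find a struct by name, walk a member, and chase its pointer down to a func_proto. A short sketch of the pattern, mirroring how bpf_struct_ops_init() uses them (assumes btf_type_member() as made available via include/linux/btf.h by this series):

	/* Sketch: resolve "tcp_congestion_ops" and test whether its first
	 * member is a function pointer.
	 */
	s32 id = btf_find_by_name_kind(btf, "tcp_congestion_ops", BTF_KIND_STRUCT);

	if (id > 0) {
		const struct btf_type *t = btf_type_by_id(btf, id);
		const struct btf_member *m = btf_type_member(t);
		const struct btf_type *fp;

		fp = btf_type_resolve_func_ptr(btf, m->type, NULL);
		if (fp) {
			/* member 0 is a func ptr; fp is its BTF_KIND_FUNC_PROTO,
			 * ready for btf_distill_func_proto().
			 */
		}
	}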
@@ -446,30 +505,6 @@ static const char *btf_int_encoding_str(u8 encoding)
 	return "UNKN";
 }

-static u16 btf_type_vlen(const struct btf_type *t)
-{
-	return BTF_INFO_VLEN(t->info);
-}
-
-static bool btf_type_kflag(const struct btf_type *t)
-{
-	return BTF_INFO_KFLAG(t->info);
-}
-
-static u32 btf_member_bit_offset(const struct btf_type *struct_type,
-				 const struct btf_member *member)
-{
-	return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
-					   : member->offset;
-}
-
-static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
-				    const struct btf_member *member)
-{
-	return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
-					   : 0;
-}
-
 static u32 btf_type_int(const struct btf_type *t)
 {
 	return *(u32 *)(t + 1);
@@ -480,11 +515,6 @@ static const struct btf_array *btf_type_array(const struct btf_type *t)
 	return (const struct btf_array *)(t + 1);
 }

-static const struct btf_member *btf_type_member(const struct btf_type *t)
-{
-	return (const struct btf_member *)(t + 1);
-}
-
 static const struct btf_enum *btf_type_enum(const struct btf_type *t)
 {
 	return (const struct btf_enum *)(t + 1);
@@ -1057,7 +1087,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
  *             *elem_type: same as return type ("struct X")
  *             *total_nelems: 1
  */
-static const struct btf_type *
+const struct btf_type *
 btf_resolve_size(const struct btf *btf, const struct btf_type *type,
 		 u32 *type_size, const struct btf_type **elem_type,
 		 u32 *total_nelems)
@@ -1111,8 +1141,10 @@ resolved:
 		return ERR_PTR(-EINVAL);

 	*type_size = nelems * size;
-	*total_nelems = nelems;
-	*elem_type = type;
+	if (total_nelems)
+		*total_nelems = nelems;
+	if (elem_type)
+		*elem_type = type;

 	return array_type ? : type;
 }
@@ -1826,7 +1858,10 @@ static void btf_modifier_seq_show(const struct btf *btf,
 				  u32 type_id, void *data,
 				  u8 bits_offset, struct seq_file *m)
 {
-	t = btf_type_id_resolve(btf, &type_id);
+	if (btf->resolved_ids)
+		t = btf_type_id_resolve(btf, &type_id);
+	else
+		t = btf_type_skip_modifiers(btf, type_id, NULL);

 	btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
 }
@@ -2621,8 +2656,8 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}

-	if (btf_type_vlen(t)) {
-		btf_verifier_log_type(env, t, "vlen != 0");
+	if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) {
+		btf_verifier_log_type(env, t, "Invalid func linkage");
 		return -EINVAL;
 	}

@@ -3476,7 +3511,8 @@ static u8 bpf_ctx_convert_map[] = {

 static const struct btf_member *
 btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
-		      const struct btf_type *t, enum bpf_prog_type prog_type)
+		      const struct btf_type *t, enum bpf_prog_type prog_type,
+		      int arg)
 {
 	const struct btf_type *conv_struct;
 	const struct btf_type *ctx_struct;
@@ -3497,12 +3533,13 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
 		 * is not supported yet.
 		 * BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
 		 */
-		bpf_log(log, "BPF program ctx type is not a struct\n");
+		if (log->level & BPF_LOG_LEVEL)
+			bpf_log(log, "arg#%d type is not a struct\n", arg);
 		return NULL;
 	}
 	tname = btf_name_by_offset(btf, t->name_off);
 	if (!tname) {
-		bpf_log(log, "BPF program ctx struct doesn't have a name\n");
+		bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
 		return NULL;
 	}
 	/* prog_type is valid bpf program type. No need for bounds check. */
@@ -3535,11 +3572,12 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
 static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
 				    struct btf *btf,
 				    const struct btf_type *t,
-				    enum bpf_prog_type prog_type)
+				    enum bpf_prog_type prog_type,
+				    int arg)
 {
 	const struct btf_member *prog_ctx_type, *kern_ctx_type;

-	prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type);
+	prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
 	if (!prog_ctx_type)
 		return -ENOENT;
 	kern_ctx_type = prog_ctx_type + 1;
@@ -3605,6 +3643,8 @@ struct btf *btf_parse_vmlinux(void)
 		goto errout;
 	}

+	bpf_struct_ops_init(btf);
+
 	btf_verifier_env_free(env);
 	refcount_set(&btf->refcnt, 1);
 	return btf;
@@ -3629,6 +3669,19 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
 	}
 }

+static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+{
+	/* t comes in already as a pointer */
+	t = btf_type_by_id(btf, t->type);
+
+	/* allow const */
+	if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
+		t = btf_type_by_id(btf, t->type);
+
+	/* char, signed char, unsigned char */
+	return btf_type_is_int(t) && t->size == 1;
+}
+
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info)
@@ -3677,7 +3730,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	/* skip modifiers */
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
-	if (btf_type_is_int(t))
+	if (btf_type_is_int(t) || btf_type_is_enum(t))
 		/* accessing a scalar */
 		return true;
 	if (!btf_type_is_ptr(t)) {
@@ -3695,12 +3748,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		 */
 		return true;

+	if (is_string_ptr(btf, t))
+		return true;
+
 	/* this is a pointer to another type */
 	info->reg_type = PTR_TO_BTF_ID;
-	info->btf_id = t->type;

 	if (tgt_prog) {
-		ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type);
+		ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
 		if (ret > 0) {
 			info->btf_id = ret;
 			return true;
@@ -3708,10 +3763,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			return false;
 		}
 	}

+	info->btf_id = t->type;
 	t = btf_type_by_id(btf, t->type);
 	/* skip modifiers */
-	while (btf_type_is_modifier(t))
+	while (btf_type_is_modifier(t)) {
+		info->btf_id = t->type;
 		t = btf_type_by_id(btf, t->type);
+	}
 	if (!btf_type_is_struct(t)) {
 		bpf_log(log,
 			"func '%s' arg%d type %s is not a struct\n",
@@ -3737,23 +3796,57 @@ int btf_struct_access(struct bpf_verifier_log *log,
 again:
 	tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
 	if (!btf_type_is_struct(t)) {
-		bpf_log(log, "Type '%s' is not a struct", tname);
+		bpf_log(log, "Type '%s' is not a struct\n", tname);
 		return -EINVAL;
 	}

-	for_each_member(i, t, member) {
-		if (btf_member_bitfield_size(t, member))
-			/* bitfields are not supported yet */
-			continue;
+	if (off + size > t->size) {
+		bpf_log(log, "access beyond struct %s at off %u size %u\n",
+			tname, off, size);
+		return -EACCES;
+	}

+	for_each_member(i, t, member) {
 		/* offset of the field in bytes */
 		moff = btf_member_bit_offset(t, member) / 8;
 		if (off + size <= moff)
 			/* won't find anything, field is already too far */
 			break;

+		if (btf_member_bitfield_size(t, member)) {
+			u32 end_bit = btf_member_bit_offset(t, member) +
+				btf_member_bitfield_size(t, member);
+
+			/* off <= moff instead of off == moff because clang
+			 * does not generate a BTF member for anonymous
+			 * bitfield like the ":16" here:
+			 * struct {
+			 *	int :16;
+			 *	int x:8;
+			 * };
+			 */
+			if (off <= moff &&
+			    BITS_ROUNDUP_BYTES(end_bit) <= off + size)
+				return SCALAR_VALUE;
+
+			/* off may be accessing a following member
+			 *
+			 * or
+			 *
+			 * Doing partial access at either end of this
+			 * bitfield.  Continue on this case also to
+			 * treat it as not accessing this bitfield
+			 * and eventually error out as field not
+			 * found to keep it simple.
+			 * It could be relaxed if there was a legit
+			 * partial access case later.
+			 */
+			continue;
+		}
+
 		/* In case of "off" is pointing to holes of a struct */
 		if (off < moff)
-			continue;
+			break;

 		/* type of the field */
 		mtype = btf_type_by_id(btf_vmlinux, member->type);
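
To make the bitfield rule concrete: a load is accepted as SCALAR_VALUE only if it starts at or before the bitfield's containing byte and covers the bitfield's rounded-up end. A worked example (illustrative):

	struct example {
		int	:16;	/* anonymous, no BTF member emitted by clang */
		int	x:8;	/* BTF: bit offset 16, bitfield size 8 */
	};

	/* moff = 16 / 8 = 2, end_bit = 16 + 8 = 24, BITS_ROUNDUP_BYTES(24) = 3.
	 * A 4-byte read at off 0 passes (0 <= 2 and 3 <= 4), as does a
	 * 1-byte read at off 2 (2 <= 2 and 3 <= 3).  A 1-byte read at off 3
	 * matches neither branch and eventually errors out as field not
	 * found, per the comment above.
	 */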
@@ -4043,11 +4136,158 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 	return 0;
 }

-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
+/* Compare BTFs of two functions assuming only scalars and pointers to context.
+ * t1 points to BTF_KIND_FUNC in btf1
+ * t2 points to BTF_KIND_FUNC in btf2
+ * Returns:
+ * EINVAL - function prototype mismatch
+ * EFAULT - verifier bug
+ * 0 - 99% match. The last 1% is validated by the verifier.
+ */
+int btf_check_func_type_match(struct bpf_verifier_log *log,
+			      struct btf *btf1, const struct btf_type *t1,
+			      struct btf *btf2, const struct btf_type *t2)
+{
+	const struct btf_param *args1, *args2;
+	const char *fn1, *fn2, *s1, *s2;
+	u32 nargs1, nargs2, i;
+
+	fn1 = btf_name_by_offset(btf1, t1->name_off);
+	fn2 = btf_name_by_offset(btf2, t2->name_off);
+
+	if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) {
+		bpf_log(log, "%s() is not a global function\n", fn1);
+		return -EINVAL;
+	}
+	if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) {
+		bpf_log(log, "%s() is not a global function\n", fn2);
+		return -EINVAL;
+	}
+
+	t1 = btf_type_by_id(btf1, t1->type);
+	if (!t1 || !btf_type_is_func_proto(t1))
+		return -EFAULT;
+	t2 = btf_type_by_id(btf2, t2->type);
+	if (!t2 || !btf_type_is_func_proto(t2))
+		return -EFAULT;
+
+	args1 = (const struct btf_param *)(t1 + 1);
+	nargs1 = btf_type_vlen(t1);
+	args2 = (const struct btf_param *)(t2 + 1);
+	nargs2 = btf_type_vlen(t2);
+
+	if (nargs1 != nargs2) {
+		bpf_log(log, "%s() has %d args while %s() has %d args\n",
+			fn1, nargs1, fn2, nargs2);
+		return -EINVAL;
+	}
+
+	t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+	t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+	if (t1->info != t2->info) {
+		bpf_log(log,
+			"Return type %s of %s() doesn't match type %s of %s()\n",
+			btf_type_str(t1), fn1,
+			btf_type_str(t2), fn2);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < nargs1; i++) {
+		t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+		t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL);
+
+		if (t1->info != t2->info) {
+			bpf_log(log, "arg%d in %s() is %s while %s() has %s\n",
+				i, fn1, btf_type_str(t1),
+				fn2, btf_type_str(t2));
+			return -EINVAL;
+		}
+		if (btf_type_has_size(t1) && t1->size != t2->size) {
+			bpf_log(log,
+				"arg%d in %s() has size %d while %s() has %d\n",
+				i, fn1, t1->size,
+				fn2, t2->size);
+			return -EINVAL;
+		}
+
+		/* global functions are validated with scalars and pointers
+		 * to context only. And only global functions can be replaced.
+		 * Hence type check only those types.
+		 */
+		if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+			continue;
+		if (!btf_type_is_ptr(t1)) {
+			bpf_log(log,
+				"arg%d in %s() has unrecognized type\n",
+				i, fn1);
+			return -EINVAL;
+		}
+		t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+		t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+		if (!btf_type_is_struct(t1)) {
+			bpf_log(log,
+				"arg%d in %s() is not a pointer to context\n",
+				i, fn1);
+			return -EINVAL;
+		}
+		if (!btf_type_is_struct(t2)) {
+			bpf_log(log,
+				"arg%d in %s() is not a pointer to context\n",
+				i, fn2);
+			return -EINVAL;
+		}
+		/* This is an optional check to make program writing easier.
+		 * Compare names of structs and report an error to the user.
+		 * btf_prepare_func_args() already checked that t2 struct
+		 * is a context type. btf_prepare_func_args() will check
+		 * later that t1 struct is a context type as well.
+		 */
+		s1 = btf_name_by_offset(btf1, t1->name_off);
+		s2 = btf_name_by_offset(btf2, t2->name_off);
+		if (strcmp(s1, s2)) {
+			bpf_log(log,
+				"arg%d %s(struct %s *) doesn't match %s(struct %s *)\n",
+				i, fn1, s1, fn2, s2);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}

+/* Compare BTFs of given program with BTF of target program */
+int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
+			 struct btf *btf2, const struct btf_type *t2)
+{
+	struct btf *btf1 = prog->aux->btf;
+	const struct btf_type *t1;
+	u32 btf_id = 0;
+
+	if (!prog->aux->func_info) {
+		bpf_log(&env->log, "Program extension requires BTF\n");
+		return -EINVAL;
+	}
+
+	btf_id = prog->aux->func_info[0].type_id;
+	if (!btf_id)
+		return -EFAULT;
+
+	t1 = btf_type_by_id(btf1, btf_id);
+	if (!t1 || !btf_type_is_func(t1))
+		return -EFAULT;
+
+	return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2);
+}
+
+/* Compare BTF of a function with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
+			     struct bpf_reg_state *reg)
 {
-	struct bpf_verifier_state *st = env->cur_state;
-	struct bpf_func_state *func = st->frame[st->curframe];
-	struct bpf_reg_state *reg = func->regs;
 	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
 	struct btf *btf = prog->aux->btf;
@@ -4057,27 +4297,30 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
 	const char *tname;

 	if (!prog->aux->func_info)
-		return 0;
+		return -EINVAL;

 	btf_id = prog->aux->func_info[subprog].type_id;
 	if (!btf_id)
-		return 0;
+		return -EFAULT;

 	if (prog->aux->func_info_aux[subprog].unreliable)
-		return 0;
+		return -EINVAL;

 	t = btf_type_by_id(btf, btf_id);
 	if (!t || !btf_type_is_func(t)) {
-		bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n",
+		/* These checks were already done by the verifier while loading
+		 * struct bpf_func_info
+		 */
+		bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
 			subprog);
-		return -EINVAL;
+		return -EFAULT;
 	}
 	tname = btf_name_by_offset(btf, t->name_off);

 	t = btf_type_by_id(btf, t->type);
 	if (!t || !btf_type_is_func_proto(t)) {
-		bpf_log(log, "Invalid type of func %s\n", tname);
-		return -EINVAL;
+		bpf_log(log, "Invalid BTF of func %s\n", tname);
+		return -EFAULT;
 	}
 	args = (const struct btf_param *)(t + 1);
 	nargs = btf_type_vlen(t);
@@ -4103,25 +4346,130 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
 			bpf_log(log, "R%d is not a pointer\n", i + 1);
 			goto out;
 		}
-		/* If program is passing PTR_TO_CTX into subprogram
-		 * check that BTF type matches.
+		/* If function expects ctx type in BTF check that caller
+		 * is passing PTR_TO_CTX.
 		 */
-		if (reg[i + 1].type == PTR_TO_CTX &&
-		    !btf_get_prog_ctx_type(log, btf, t, prog->type))
-			goto out;
-		/* All other pointers are ok */
-		continue;
+		if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
+			if (reg[i + 1].type != PTR_TO_CTX) {
+				bpf_log(log,
+					"arg#%d expected pointer to ctx, but got %s\n",
+					i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+				goto out;
+			}
+			if (check_ctx_reg(env, &reg[i + 1], i + 1))
+				goto out;
+			continue;
+		}
 		}
-		bpf_log(log, "Unrecognized argument type %s\n",
-			btf_kind_str[BTF_INFO_KIND(t->info)]);
+		bpf_log(log, "Unrecognized arg#%d type %s\n",
+			i, btf_kind_str[BTF_INFO_KIND(t->info)]);
 		goto out;
 	}
 	return 0;
 out:
-	/* LLVM optimizations can remove arguments from static functions. */
-	bpf_log(log,
-		"Type info disagrees with actual arguments due to compiler optimizations\n");
+	/* Compiler optimizations can remove arguments from static functions
+	 * or mismatched type can be passed into a global function.
+	 * In such cases mark the function as unreliable from BTF point of view.
+	 */
 	prog->aux->func_info_aux[subprog].unreliable = true;
 	return -EINVAL;
 }

+/* Convert BTF of a function into bpf_reg_state if possible
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - cannot convert BTF.
+ * 0 - Successfully converted BTF into bpf_reg_state
+ * (either PTR_TO_CTX or SCALAR_VALUE).
+ */
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
+			  struct bpf_reg_state *reg)
+{
+	struct bpf_verifier_log *log = &env->log;
+	struct bpf_prog *prog = env->prog;
+	enum bpf_prog_type prog_type = prog->type;
+	struct btf *btf = prog->aux->btf;
+	const struct btf_param *args;
+	const struct btf_type *t;
+	u32 i, nargs, btf_id;
+	const char *tname;
+
+	if (!prog->aux->func_info ||
+	    prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
+		bpf_log(log, "Verifier bug\n");
+		return -EFAULT;
+	}
+
+	btf_id = prog->aux->func_info[subprog].type_id;
+	if (!btf_id) {
+		bpf_log(log, "Global functions need valid BTF\n");
+		return -EFAULT;
+	}
+
+	t = btf_type_by_id(btf, btf_id);
+	if (!t || !btf_type_is_func(t)) {
+		/* These checks were already done by the verifier while loading
+		 * struct bpf_func_info
+		 */
+		bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
+			subprog);
+		return -EFAULT;
+	}
+	tname = btf_name_by_offset(btf, t->name_off);
+
+	if (log->level & BPF_LOG_LEVEL)
+		bpf_log(log, "Validating %s() func#%d...\n",
+			tname, subprog);
+
+	if (prog->aux->func_info_aux[subprog].unreliable) {
+		bpf_log(log, "Verifier bug in function %s()\n", tname);
+		return -EFAULT;
+	}
+	if (prog_type == BPF_PROG_TYPE_EXT)
+		prog_type = prog->aux->linked_prog->type;
+
+	t = btf_type_by_id(btf, t->type);
+	if (!t || !btf_type_is_func_proto(t)) {
+		bpf_log(log, "Invalid type of function %s()\n", tname);
+		return -EFAULT;
+	}
+	args = (const struct btf_param *)(t + 1);
+	nargs = btf_type_vlen(t);
+	if (nargs > 5) {
+		bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
+			tname, nargs);
+		return -EINVAL;
+	}
+	/* check that function returns int */
+	t = btf_type_by_id(btf, t->type);
+	while (btf_type_is_modifier(t))
+		t = btf_type_by_id(btf, t->type);
+	if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+		bpf_log(log,
+			"Global function %s() doesn't return scalar. Only those are supported.\n",
+			tname);
+		return -EINVAL;
+	}
+	/* Convert BTF function arguments into verifier types.
+	 * Only PTR_TO_CTX and SCALAR are supported atm.
+	 */
+	for (i = 0; i < nargs; i++) {
+		t = btf_type_by_id(btf, args[i].type);
+		while (btf_type_is_modifier(t))
+			t = btf_type_by_id(btf, t->type);
+		if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+			reg[i + 1].type = SCALAR_VALUE;
+			continue;
+		}
+		if (btf_type_is_ptr(t) &&
+		    btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+			reg[i + 1].type = PTR_TO_CTX;
+			continue;
+		}
+		bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
+			i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+		return -EINVAL;
+	}
+	return 0;
+}
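
Together, btf_prepare_func_args() and btf_check_func_arg_match() back the new function-by-function verification of global functions. A tiny example of BPF C source that takes this path when compiled with BTF (clang -g; the function body is illustrative):

	/* Non-static and noinline, so it keeps BTF_FUNC_GLOBAL linkage:
	 * verified once on its own with r1 seeded as PTR_TO_CTX and r2 as
	 * SCALAR_VALUE, then each call site is checked against this BTF.
	 */
	__attribute__((noinline))
	int clamp_len(struct __sk_buff *skb, int cap)
	{
		return skb->len > cap ? cap : skb->len;
	}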

kernel/bpf/cgroup.c

@@ -106,8 +106,7 @@ static u32 prog_list_length(struct list_head *head)
  * if parent has overridable or multi-prog, allow attaching
  */
 static bool hierarchy_allows_attach(struct cgroup *cgrp,
-				    enum bpf_attach_type type,
-				    u32 new_flags)
+				    enum bpf_attach_type type)
 {
 	struct cgroup *p;

@@ -290,31 +289,34 @@ cleanup:
  * propagate the change to descendants
  * @cgrp: The cgroup which descendants to traverse
  * @prog: A program to attach
+ * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
  * @type: Type of attach operation
  * @flags: Option flags
  *
  * Must be called with cgroup_mutex held.
  */
 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			struct bpf_prog *replace_prog,
 			enum bpf_attach_type type, u32 flags)
 {
+	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	struct bpf_prog *old_prog = NULL;
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
 		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+	struct bpf_prog_list *pl, *replace_pl = NULL;
 	enum bpf_cgroup_storage_type stype;
-	struct bpf_prog_list *pl;
-	bool pl_was_allocated;
 	int err;

-	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
+	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
 		/* invalid combination */
 		return -EINVAL;

-	if (!hierarchy_allows_attach(cgrp, type, flags))
+	if (!hierarchy_allows_attach(cgrp, type))
 		return -EPERM;

-	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+	if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
 		/* Disallow attaching non-overridable on top
 		 * of existing overridable in this cgroup.
 		 * Disallow attaching multi-prog if overridable or none
@@ -324,6 +326,21 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
 		return -E2BIG;

+	if (flags & BPF_F_ALLOW_MULTI) {
+		list_for_each_entry(pl, progs, node) {
+			if (pl->prog == prog)
+				/* disallow attaching the same prog twice */
+				return -EINVAL;
+			if (pl->prog == replace_prog)
+				replace_pl = pl;
+		}
+		if ((flags & BPF_F_REPLACE) && !replace_pl)
+			/* prog to replace not found for cgroup */
+			return -ENOENT;
+	} else if (!list_empty(progs)) {
+		replace_pl = list_first_entry(progs, typeof(*pl), node);
+	}
+
 	for_each_cgroup_storage_type(stype) {
 		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
 		if (IS_ERR(storage[stype])) {
@@ -334,53 +351,28 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		}
 	}

-	if (flags & BPF_F_ALLOW_MULTI) {
-		list_for_each_entry(pl, progs, node) {
-			if (pl->prog == prog) {
-				/* disallow attaching the same prog twice */
-				for_each_cgroup_storage_type(stype)
-					bpf_cgroup_storage_free(storage[stype]);
-				return -EINVAL;
-			}
+	if (replace_pl) {
+		pl = replace_pl;
+		old_prog = pl->prog;
+		for_each_cgroup_storage_type(stype) {
+			old_storage[stype] = pl->storage[stype];
+			bpf_cgroup_storage_unlink(old_storage[stype]);
 		}
-
+	} else {
 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 		if (!pl) {
 			for_each_cgroup_storage_type(stype)
 				bpf_cgroup_storage_free(storage[stype]);
 			return -ENOMEM;
 		}
-
-		pl_was_allocated = true;
-		pl->prog = prog;
-		for_each_cgroup_storage_type(stype)
-			pl->storage[stype] = storage[stype];
 		list_add_tail(&pl->node, progs);
-	} else {
-		if (list_empty(progs)) {
-			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
-			if (!pl) {
-				for_each_cgroup_storage_type(stype)
-					bpf_cgroup_storage_free(storage[stype]);
-				return -ENOMEM;
-			}
-			pl_was_allocated = true;
-			list_add_tail(&pl->node, progs);
-		} else {
-			pl = list_first_entry(progs, typeof(*pl), node);
-			old_prog = pl->prog;
-			for_each_cgroup_storage_type(stype) {
-				old_storage[stype] = pl->storage[stype];
-				bpf_cgroup_storage_unlink(old_storage[stype]);
-			}
-			pl_was_allocated = false;
-		}
-		pl->prog = prog;
-		for_each_cgroup_storage_type(stype)
-			pl->storage[stype] = storage[stype];
 	}

-	cgrp->bpf.flags[type] = flags;
+	pl->prog = prog;
+	for_each_cgroup_storage_type(stype)
+		pl->storage[stype] = storage[stype];
+
+	cgrp->bpf.flags[type] = saved_flags;

 	err = update_effective_progs(cgrp, type);
 	if (err)
@@ -408,7 +400,7 @@ cleanup:
 		pl->storage[stype] = old_storage[stype];
 		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
 	}
-	if (pl_was_allocated) {
+	if (!replace_pl) {
 		list_del(&pl->node);
 		kfree(pl);
 	}
@@ -546,6 +538,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 			   enum bpf_prog_type ptype, struct bpf_prog *prog)
 {
+	struct bpf_prog *replace_prog = NULL;
 	struct cgroup *cgrp;
 	int ret;

@@ -553,8 +546,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 	if (IS_ERR(cgrp))
 		return PTR_ERR(cgrp);

-	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
+	    (attr->attach_flags & BPF_F_REPLACE)) {
+		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
+		if (IS_ERR(replace_prog)) {
+			cgroup_put(cgrp);
+			return PTR_ERR(replace_prog);
+		}
+	}
+
+	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
 				attr->attach_flags);

+	if (replace_prog)
+		bpf_prog_put(replace_prog);
 	cgroup_put(cgrp);
 	return ret;
 }
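
On the userspace side the new flag rides on the existing BPF_PROG_ATTACH command: BPF_F_REPLACE must be paired with BPF_F_ALLOW_MULTI, and the program being swapped out goes in replace_bpf_fd. A hedged raw-syscall sketch:

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Illustrative only: atomically swap old_prog_fd for new_prog_fd in
	 * the cgroup's egress multi-prog list, keeping its position, instead
	 * of detach + attach, which leaves a window with no program attached.
	 */
	static int replace_cgroup_prog(int cgroup_fd, int new_prog_fd, int old_prog_fd)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.target_fd = cgroup_fd;
		attr.attach_bpf_fd = new_prog_fd;
		attr.replace_bpf_fd = old_prog_fd;
		attr.attach_type = BPF_CGROUP_INET_EGRESS;
		attr.attach_flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;

		return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
	}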

kernel/bpf/core.c

@@ -222,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 	u32 pages, delta;
 	int ret;

-	BUG_ON(fp_old == NULL);
-
 	size = round_up(size, PAGE_SIZE);
 	pages = size / PAGE_SIZE;
 	if (pages <= fp_old->pages)
@@ -520,9 +518,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)

 #ifdef CONFIG_BPF_JIT
 /* All BPF JIT sysctl knobs here. */
-int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
+int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
 int bpf_jit_harden   __read_mostly;
-int bpf_jit_kallsyms __read_mostly;
 long bpf_jit_limit   __read_mostly;

 static __always_inline void
@@ -2139,6 +2137,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
 const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
 const struct bpf_func_proto bpf_spin_lock_proto __weak;
 const struct bpf_func_proto bpf_spin_unlock_proto __weak;
+const struct bpf_func_proto bpf_jiffies64_proto __weak;

 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
@@ -72,17 +72,18 @@ struct bpf_cpu_map {
    struct bpf_map map;
    /* Below members specific for map type */
    struct bpf_cpu_map_entry **cpu_map;
    struct list_head __percpu *flush_list;
};

static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);

static int bq_flush_to_queue(struct xdp_bulk_queue *bq);

static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
    struct bpf_cpu_map *cmap;
    int err = -ENOMEM;
    int ret, cpu;
    u64 cost;
    int ret;

    if (!capable(CAP_SYS_ADMIN))
        return ERR_PTR(-EPERM);
@@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)

    /* make sure page count doesn't overflow */
    cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
    cost += sizeof(struct list_head) * num_possible_cpus();

    /* Notice: returns -EPERM if map size is larger than memlock limit */
    ret = bpf_map_charge_init(&cmap->map.memory, cost);
@@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
        goto free_cmap;
    }

    cmap->flush_list = alloc_percpu(struct list_head);
    if (!cmap->flush_list)
        goto free_charge;

    for_each_possible_cpu(cpu)
        INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));

    /* Alloc array for possible remote "destination" CPUs */
    cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
                                       sizeof(struct bpf_cpu_map_entry *),
                                       cmap->map.numa_node);
    if (!cmap->cpu_map)
        goto free_percpu;
        goto free_charge;

    return &cmap->map;
free_percpu:
    free_percpu(cmap->flush_list);
free_charge:
    bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
@@ -399,22 +390,14 @@ free_rcu:
static void __cpu_map_entry_free(struct rcu_head *rcu)
{
    struct bpf_cpu_map_entry *rcpu;
    int cpu;

    /* This cpu_map_entry have been disconnected from map and one
     * RCU graze-period have elapsed. Thus, XDP cannot queue any
     * RCU grace-period have elapsed. Thus, XDP cannot queue any
     * new packets and cannot change/set flush_needed that can
     * find this entry.
     */
    rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);

    /* Flush remaining packets in percpu bulkq */
    for_each_online_cpu(cpu) {
        struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);

        /* No concurrent bq_enqueue can run at this point */
        bq_flush_to_queue(bq, false);
    }
    free_percpu(rcpu->bulkq);
    /* Cannot kthread_stop() here, last put free rcpu resources */
    put_cpu_map_entry(rcpu);
@@ -436,7 +419,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
 * percpu bulkq to queue. Due to caller map_delete_elem() disable
 * preemption, cannot call kthread_stop() to make sure queue is empty.
 * Instead a work_queue is started for stopping kthread,
 * cpu_map_kthread_stop, which waits for an RCU graze period before
 * cpu_map_kthread_stop, which waits for an RCU grace period before
 * stopping kthread, emptying the queue.
 */
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
@@ -507,7 +490,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
static void cpu_map_free(struct bpf_map *map)
{
    struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
    int cpu;
    u32 i;

    /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
@@ -522,18 +504,6 @@ static void cpu_map_free(struct bpf_map *map)
    bpf_clear_redirect_map(map);
    synchronize_rcu();

    /* To ensure all pending flush operations have completed wait for flush
     * list be empty on _all_ cpus. Because the above synchronize_rcu()
     * ensures the map is disconnected from the program we can assume no new
     * items will be added to the list.
     */
    for_each_online_cpu(cpu) {
        struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);

        while (!list_empty(flush_list))
            cond_resched();
    }

    /* For cpu_map the remote CPUs can still be using the entries
     * (struct bpf_cpu_map_entry).
     */
@@ -544,10 +514,9 @@ static void cpu_map_free(struct bpf_map *map)
        if (!rcpu)
            continue;

        /* bq flush and cleanup happens after RCU graze-period */
        /* bq flush and cleanup happens after RCU grace-period */
        __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
    }
    free_percpu(cmap->flush_list);
    bpf_map_area_free(cmap->cpu_map);
    kfree(cmap);
}
@@ -599,7 +568,7 @@ const struct bpf_map_ops cpu_map_ops = {
    .map_check_btf = map_check_no_btf,
};

static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
    struct bpf_cpu_map_entry *rcpu = bq->obj;
    unsigned int processed = 0, drops = 0;
@@ -620,10 +589,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
        err = __ptr_ring_produce(q, xdpf);
        if (err) {
            drops++;
            if (likely(in_napi_ctx))
                xdp_return_frame_rx_napi(xdpf);
            else
                xdp_return_frame(xdpf);
            xdp_return_frame_rx_napi(xdpf);
        }
        processed++;
    }
@@ -642,11 +608,11 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
 */
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
    struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
    struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
    struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);

    if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
        bq_flush_to_queue(bq, true);
        bq_flush_to_queue(bq);

    /* Notice, xdp_buff/page MUST be queued here, long enough for
     * the driver code invoking us to finish, due to driver
@@ -681,16 +647,26 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
    return 0;
}

void __cpu_map_flush(struct bpf_map *map)
void __cpu_map_flush(void)
{
    struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
    struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
    struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
    struct xdp_bulk_queue *bq, *tmp;

    list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
        bq_flush_to_queue(bq, true);
        bq_flush_to_queue(bq);

        /* If already running, costs spin_lock_irqsave + smp_mb */
        wake_up_process(bq->obj->kthread);
    }
}

static int __init cpu_map_init(void)
{
    int cpu;

    for_each_possible_cpu(cpu)
        INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
    return 0;
}

subsys_initcall(cpu_map_init);

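The cpumap rework above replaces the per-map flush_list with a single per-CPU list (cpu_map_flush_list) that each bulk queue links itself onto while it holds pending frames; __cpu_map_flush() then walks only the queues that are actually dirty. Below is a minimal single-threaded userspace sketch of that pattern, not kernel code: bulk_queue, bq_enqueue(), bq_flush() and flush_all() are hypothetical names standing in for their kernel counterparts.

#include <stddef.h>
#include <stdio.h>

#define BULK_SIZE 8

struct list_node { struct list_node *prev, *next; };

struct bulk_queue {
    int q[BULK_SIZE];
    unsigned int count;
    struct list_node flush_node; /* linked into flush_list while non-empty */
};

/* stand-in for this_cpu_ptr(&cpu_map_flush_list) */
static struct list_node flush_list = { &flush_list, &flush_list };

static void list_add_head(struct list_node *n, struct list_node *head)
{
    n->next = head->next;
    n->prev = head;
    head->next->prev = n;
    head->next = n;
}

static void list_del_node(struct list_node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

static void bq_flush(struct bulk_queue *bq)
{
    printf("flushing %u frames\n", bq->count);
    bq->count = 0;
    list_del_node(&bq->flush_node); /* __list_del_clearprev() in the kernel */
}

static void bq_enqueue(struct bulk_queue *bq, int frame)
{
    if (bq->count == BULK_SIZE)
        bq_flush(bq);
    if (!bq->count) /* first frame: make the queue visible to the flusher */
        list_add_head(&bq->flush_node, &flush_list);
    bq->q[bq->count++] = frame;
}

/* stand-in for __cpu_map_flush(): drain every queue with pending frames */
static void flush_all(void)
{
    while (flush_list.next != &flush_list) {
        struct bulk_queue *bq = (struct bulk_queue *)
            ((char *)flush_list.next - offsetof(struct bulk_queue, flush_node));
        bq_flush(bq);
    }
}

int main(void)
{
    struct bulk_queue a = { .count = 0 }, b = { .count = 0 };

    bq_enqueue(&a, 1);
    bq_enqueue(&b, 2);
    bq_enqueue(&a, 3);
    flush_all(); /* drains both queues, as a flush at NAPI-poll end does */
    return 0;
}
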
@@ -53,13 +53,11 @@
    (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

#define DEV_MAP_BULK_SIZE 16
struct bpf_dtab_netdev;

struct xdp_bulk_queue {
struct xdp_dev_bulk_queue {
    struct xdp_frame *q[DEV_MAP_BULK_SIZE];
    struct list_head flush_node;
    struct net_device *dev;
    struct net_device *dev_rx;
    struct bpf_dtab_netdev *obj;
    unsigned int count;
};

@@ -67,15 +65,13 @@ struct bpf_dtab_netdev {
    struct net_device *dev; /* must be first member, due to tracepoint */
    struct hlist_node index_hlist;
    struct bpf_dtab *dtab;
    struct xdp_bulk_queue __percpu *bulkq;
    struct rcu_head rcu;
    unsigned int idx; /* keep track of map index for tracepoint */
    unsigned int idx;
};

struct bpf_dtab {
    struct bpf_map map;
    struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
    struct list_head __percpu *flush_list;
    struct list_head list;

    /* these are only used for DEVMAP_HASH type maps */
@@ -85,6 +81,7 @@ struct bpf_dtab {
    u32 n_buckets;
};

static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

@@ -109,8 +106,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,

static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
    int err, cpu;
    u64 cost;
    u64 cost = 0;
    int err;

    /* check sanity of attributes */
    if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -125,9 +122,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)

    bpf_map_init_from_attr(&dtab->map, attr);

    /* make sure page count doesn't overflow */
    cost = (u64) sizeof(struct list_head) * num_possible_cpus();

    if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
        dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);

@@ -143,17 +137,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
    if (err)
        return -EINVAL;

    dtab->flush_list = alloc_percpu(struct list_head);
    if (!dtab->flush_list)
        goto free_charge;

    for_each_possible_cpu(cpu)
        INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));

    if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
        dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
        if (!dtab->dev_index_head)
            goto free_percpu;
            goto free_charge;

        spin_lock_init(&dtab->index_lock);
    } else {
@@ -161,13 +148,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
                                   sizeof(struct bpf_dtab_netdev *),
                                   dtab->map.numa_node);
        if (!dtab->netdev_map)
            goto free_percpu;
            goto free_charge;
    }

    return 0;

free_percpu:
    free_percpu(dtab->flush_list);
free_charge:
    bpf_map_charge_finish(&dtab->map.memory);
    return -ENOMEM;
@@ -201,14 +186,16 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
static void dev_map_free(struct bpf_map *map)
{
    struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
    int i, cpu;
    int i;

    /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
     * so the programs (can be more than one that used this map) were
     * disconnected from events. Wait for outstanding critical sections in
     * these programs to complete. The rcu critical section only guarantees
     * no further reads against netdev_map. It does __not__ ensure pending
     * flush operations (if any) are complete.
     * disconnected from events. The following synchronize_rcu() guarantees
     * both rcu read critical sections complete and waits for
     * preempt-disable regions (NAPI being the relevant context here) so we
     * are certain there will be no further reads against the netdev_map and
     * all flush operations are complete. Flush operations can only be done
     * from NAPI context for this reason.
     */

    spin_lock(&dev_map_lock);
@@ -221,18 +208,6 @@ static void dev_map_free(struct bpf_map *map)
    /* Make sure prior __dev_map_entry_free() have completed. */
    rcu_barrier();

    /* To ensure all pending flush operations have completed wait for flush
     * list to empty on _all_ cpus.
     * Because the above synchronize_rcu() ensures the map is disconnected
     * from the program we can assume no new items will be added.
     */
    for_each_online_cpu(cpu) {
        struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);

        while (!list_empty(flush_list))
            cond_resched();
    }

    if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
        for (i = 0; i < dtab->n_buckets; i++) {
            struct bpf_dtab_netdev *dev;
@@ -243,7 +218,6 @@ static void dev_map_free(struct bpf_map *map)

            hlist_for_each_entry_safe(dev, next, head, index_hlist) {
                hlist_del_rcu(&dev->index_hlist);
                free_percpu(dev->bulkq);
                dev_put(dev->dev);
                kfree(dev);
            }
@@ -258,7 +232,6 @@ static void dev_map_free(struct bpf_map *map)
            if (!dev)
                continue;

            free_percpu(dev->bulkq);
            dev_put(dev->dev);
            kfree(dev);
        }
@@ -266,7 +239,6 @@ static void dev_map_free(struct bpf_map *map)
        bpf_map_area_free(dtab->netdev_map);
    }

    free_percpu(dtab->flush_list);
    kfree(dtab);
}

@@ -293,7 +265,8 @@ struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
    struct hlist_head *head = dev_map_index_hash(dtab, key);
    struct bpf_dtab_netdev *dev;

    hlist_for_each_entry_rcu(dev, head, index_hlist)
    hlist_for_each_entry_rcu(dev, head, index_hlist,
                             lockdep_is_held(&dtab->index_lock))
        if (dev->idx == key)
            return dev;

@@ -345,11 +318,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
    return -ENOENT;
}

static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
                       bool in_napi_ctx)
static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
    struct bpf_dtab_netdev *obj = bq->obj;
    struct net_device *dev = obj->dev;
    struct net_device *dev = bq->dev;
    int sent = 0, drops = 0, err = 0;
    int i;

@@ -372,8 +343,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
out:
    bq->count = 0;

    trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx,
                          sent, drops, bq->dev_rx, dev, err);
    trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
    bq->dev_rx = NULL;
    __list_del_clearprev(&bq->flush_node);
    return 0;
@@ -384,33 +354,29 @@ error:
    for (i = 0; i < bq->count; i++) {
        struct xdp_frame *xdpf = bq->q[i];

        /* RX path under NAPI protection, can return frames faster */
        if (likely(in_napi_ctx))
            xdp_return_frame_rx_napi(xdpf);
        else
            xdp_return_frame(xdpf);
        xdp_return_frame_rx_napi(xdpf);
        drops++;
    }
    goto out;
}

/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
 * from the driver before returning from its napi->poll() routine. The poll()
 * routine is called either from busy_poll context or net_rx_action signaled
 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
 * net device can be torn down. On devmap tear down we ensure the flush list
 * is empty before completing to ensure all flush operations have completed.
 * When drivers update the bpf program they may need to ensure any flush ops
 * are also complete. Using synchronize_rcu or call_rcu will suffice for this
 * because both wait for napi context to exit.
 */
void __dev_map_flush(struct bpf_map *map)
void __dev_flush(void)
{
    struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
    struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
    struct xdp_bulk_queue *bq, *tmp;
    struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
    struct xdp_dev_bulk_queue *bq, *tmp;

    rcu_read_lock();
    list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
        bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
    rcu_read_unlock();
        bq_xmit_all(bq, XDP_XMIT_FLUSH);
}

/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -432,15 +398,14 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
 * Thus, safe percpu variable access.
 */
static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                      struct net_device *dev_rx)

{
    struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
    struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
    struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
    struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);

    if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
        bq_xmit_all(bq, 0, true);
        bq_xmit_all(bq, 0);

    /* Ingress dev_rx will be the same for all xdp_frame's in
     * bulk_queue, because bq stored per-CPU and must be flushed
@@ -457,10 +422,9 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
    return 0;
}

int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
                    struct net_device *dev_rx)
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
                                struct net_device *dev_rx)
{
    struct net_device *dev = dst->dev;
    struct xdp_frame *xdpf;
    int err;

@@ -475,7 +439,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
    if (unlikely(!xdpf))
        return -EOVERFLOW;

    return bq_enqueue(dst, xdpf, dev_rx);
    return bq_enqueue(dev, xdpf, dev_rx);
}

int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
                    struct net_device *dev_rx)
{
    return __xdp_enqueue(dev, xdp, dev_rx);
}

int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
                    struct net_device *dev_rx)
{
    struct net_device *dev = dst->dev;

    return __xdp_enqueue(dev, xdp, dev_rx);
}

int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -509,28 +487,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
    return dev ? &dev->ifindex : NULL;
}

static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
    if (dev->dev->netdev_ops->ndo_xdp_xmit) {
        struct xdp_bulk_queue *bq;
        int cpu;

        rcu_read_lock();
        for_each_online_cpu(cpu) {
            bq = per_cpu_ptr(dev->bulkq, cpu);
            bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
        }
        rcu_read_unlock();
    }
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
    struct bpf_dtab_netdev *dev;

    dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
    dev_map_flush_old(dev);
    free_percpu(dev->bulkq);
    dev_put(dev->dev);
    kfree(dev);
}
@@ -545,12 +506,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
        return -EINVAL;

    /* Use call_rcu() here to ensure any rcu critical sections have
     * completed, but this does not guarantee a flush has happened
     * yet. Because driver side rcu_read_lock/unlock only protects the
     * running XDP program. However, for pending flush operations the
     * dev and ctx are stored in another per cpu map. And additionally,
     * the driver tear down ensures all soft irqs are complete before
     * removing the net device in the case of dev_put equals zero.
     * completed as well as any flush operations because call_rcu
     * will wait for preempt-disable region to complete, NAPI in this
     * context. And additionally, the driver tear down ensures all
     * soft irqs are complete before removing the net device in the
     * case of dev_put equals zero.
     */
    old_dev = xchg(&dtab->netdev_map[k], NULL);
    if (old_dev)
@@ -585,30 +545,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
                                                    u32 ifindex,
                                                    unsigned int idx)
{
    gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
    struct bpf_dtab_netdev *dev;
    struct xdp_bulk_queue *bq;
    int cpu;

    dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
    dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
                       dtab->map.numa_node);
    if (!dev)
        return ERR_PTR(-ENOMEM);

    dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
                                    sizeof(void *), gfp);
    if (!dev->bulkq) {
        kfree(dev);
        return ERR_PTR(-ENOMEM);
    }

    for_each_possible_cpu(cpu) {
        bq = per_cpu_ptr(dev->bulkq, cpu);
        bq->obj = dev;
    }

    dev->dev = dev_get_by_index(net, ifindex);
    if (!dev->dev) {
        free_percpu(dev->bulkq);
        kfree(dev);
        return ERR_PTR(-EINVAL);
    }
@@ -768,9 +713,23 @@ static int dev_map_notification(struct notifier_block *notifier,
{
    struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
    struct bpf_dtab *dtab;
    int i;
    int i, cpu;

    switch (event) {
    case NETDEV_REGISTER:
        if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
            break;

        /* will be freed in free_netdev() */
        netdev->xdp_bulkq =
            __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
                               sizeof(void *), GFP_ATOMIC);
        if (!netdev->xdp_bulkq)
            return NOTIFY_BAD;

        for_each_possible_cpu(cpu)
            per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
        break;
    case NETDEV_UNREGISTER:
        /* This rcu_read_lock/unlock pair is needed because
         * dev_map_list is an RCU list AND to ensure a delete
@@ -810,10 +769,15 @@ static struct notifier_block dev_map_notifier = {

static int __init dev_map_init(void)
{
    int cpu;

    /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
    BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
                 offsetof(struct _bpf_dtab_netdev, dev));
    register_netdevice_notifier(&dev_map_notifier);

    for_each_possible_cpu(cpu)
        INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
    return 0;
}

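The comment block above pins the flush contract to the NAPI poll lifecycle. For orientation, this is roughly where a driver triggers the flush after this series; an illustrative kernel-style fragment only, where mydrv_poll() and mydrv_clean_rx() are hypothetical driver functions while xdp_do_flush() and napi_complete_done() are the real entry points:

static int mydrv_poll(struct napi_struct *napi, int budget)
{
    int work_done;

    /* RX processing may run an XDP program that calls bpf_redirect_map();
     * redirected frames sit in this CPU's dev_flush_list / cpu_map_flush_list
     * bulk queues until flushed.
     */
    work_done = mydrv_clean_rx(napi, budget);

    /* must run before leaving NAPI context, per the comment above */
    xdp_do_flush();

    if (work_done < budget)
        napi_complete_done(napi, work_done);
    return work_done;
}
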
158
kernel/bpf/dispatcher.c
Normal file
@@ -0,0 +1,158 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2019 Intel Corporation. */

#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>

/* The BPF dispatcher is a multiway branch code generator. The
 * dispatcher is a mechanism to avoid the performance penalty of an
 * indirect call, which is expensive when retpolines are enabled. A
 * dispatch client registers a BPF program into the dispatcher, and if
 * there is available room in the dispatcher a direct call to the BPF
 * program will be generated. All calls to the BPF programs called via
 * the dispatcher will then be a direct call, instead of an
 * indirect. The dispatcher hijacks a trampoline function via the
 * __fentry__ of the trampoline. The trampoline function has the
 * following signature:
 *
 * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi,
 *                         unsigned int (*bpf_func)(const void *,
 *                                                  const struct bpf_insn *));
 */

static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog(
    struct bpf_dispatcher *d, struct bpf_prog *prog)
{
    int i;

    for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
        if (prog == d->progs[i].prog)
            return &d->progs[i];
    }
    return NULL;
}

static struct bpf_dispatcher_prog *bpf_dispatcher_find_free(
    struct bpf_dispatcher *d)
{
    return bpf_dispatcher_find_prog(d, NULL);
}

static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d,
                                    struct bpf_prog *prog)
{
    struct bpf_dispatcher_prog *entry;

    if (!prog)
        return false;

    entry = bpf_dispatcher_find_prog(d, prog);
    if (entry) {
        refcount_inc(&entry->users);
        return false;
    }

    entry = bpf_dispatcher_find_free(d);
    if (!entry)
        return false;

    bpf_prog_inc(prog);
    entry->prog = prog;
    refcount_set(&entry->users, 1);
    d->num_progs++;
    return true;
}

static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
                                       struct bpf_prog *prog)
{
    struct bpf_dispatcher_prog *entry;

    if (!prog)
        return false;

    entry = bpf_dispatcher_find_prog(d, prog);
    if (!entry)
        return false;

    if (refcount_dec_and_test(&entry->users)) {
        entry->prog = NULL;
        bpf_prog_put(prog);
        d->num_progs--;
        return true;
    }
    return false;
}

int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
{
    return -ENOTSUPP;
}

static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
{
    s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
    int i;

    for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
        if (d->progs[i].prog)
            *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
    }
    return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
}

static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
{
    void *old, *new;
    u32 noff;
    int err;

    if (!prev_num_progs) {
        old = NULL;
        noff = 0;
    } else {
        old = d->image + d->image_off;
        noff = d->image_off ^ (BPF_IMAGE_SIZE / 2);
    }

    new = d->num_progs ? d->image + noff : NULL;
    if (new) {
        if (bpf_dispatcher_prepare(d, new))
            return;
    }

    err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
    if (err || !new)
        return;

    d->image_off = noff;
}

void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
                                struct bpf_prog *to)
{
    bool changed = false;
    int prev_num_progs;

    if (from == to)
        return;

    mutex_lock(&d->mutex);
    if (!d->image) {
        d->image = bpf_image_alloc();
        if (!d->image)
            goto out;
    }

    prev_num_progs = d->num_progs;
    changed |= bpf_dispatcher_remove_prog(d, from);
    changed |= bpf_dispatcher_add_prog(d, to);

    if (!changed)
        goto out;

    bpf_dispatcher_update(d, prev_num_progs);
out:
    mutex_unlock(&d->mutex);
}
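What arch_prepare_bpf_dispatcher() emits is, in effect, a compare-and-direct-call chain over the registered bpf_func pointers, so the common case avoids a retpolined indirect call. A minimal userspace C sketch of the idea, where dispatch() plays the role of the poked trampoline and prog_a/prog_b are hypothetical programs:

#include <stdio.h>

typedef unsigned int (*bpf_func_t)(const void *ctx);

static unsigned int prog_a(const void *ctx) { (void)ctx; return 1; }
static unsigned int prog_b(const void *ctx) { (void)ctx; return 2; }

/* the "dispatcher": each registered target becomes a direct call guarded
 * by a pointer compare; the indirect call survives only as a fallback
 */
static unsigned int dispatch(const void *ctx, bpf_func_t bpf_func)
{
    if (bpf_func == prog_a)
        return prog_a(ctx); /* direct call, no retpoline cost */
    if (bpf_func == prog_b)
        return prog_b(ctx);
    return bpf_func(ctx);   /* fallback: indirect call */
}

int main(void)
{
    printf("%u %u\n", dispatch(NULL, prog_a), dispatch(NULL, prog_b));
    return 0;
}
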
@@ -17,6 +17,16 @@
    (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
     BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)                        \
    .map_lookup_batch =                         \
    _name##_map_lookup_batch,                   \
    .map_lookup_and_delete_batch =              \
    _name##_map_lookup_and_delete_batch,        \
    .map_update_batch =                         \
    generic_map_update_batch,                   \
    .map_delete_batch =                         \
    generic_map_delete_batch

struct bucket {
    struct hlist_nulls_head head;
    raw_spinlock_t lock;
@@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
    rcu_read_unlock();
}

static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
                                   const union bpf_attr *attr,
                                   union bpf_attr __user *uattr,
                                   bool do_delete, bool is_lru_map,
                                   bool is_percpu)
{
    struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
    u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
    void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
    void __user *uvalues = u64_to_user_ptr(attr->batch.values);
    void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
    void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
    u32 batch, max_count, size, bucket_size;
    u64 elem_map_flags, map_flags;
    struct hlist_nulls_head *head;
    struct hlist_nulls_node *n;
    unsigned long flags;
    struct htab_elem *l;
    struct bucket *b;
    int ret = 0;

    elem_map_flags = attr->batch.elem_flags;
    if ((elem_map_flags & ~BPF_F_LOCK) ||
        ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
        return -EINVAL;

    map_flags = attr->batch.flags;
    if (map_flags)
        return -EINVAL;

    max_count = attr->batch.count;
    if (!max_count)
        return 0;

    if (put_user(0, &uattr->batch.count))
        return -EFAULT;

    batch = 0;
    if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
        return -EFAULT;

    if (batch >= htab->n_buckets)
        return -ENOENT;

    key_size = htab->map.key_size;
    roundup_key_size = round_up(htab->map.key_size, 8);
    value_size = htab->map.value_size;
    size = round_up(value_size, 8);
    if (is_percpu)
        value_size = size * num_possible_cpus();
    total = 0;
    /* While experimenting with hash tables with sizes ranging from 10 to
     * 1000, it was observed that a bucket can have up to 5 entries.
     */
    bucket_size = 5;

alloc:
    /* We cannot do copy_from_user or copy_to_user inside
     * the rcu_read_lock. Allocate enough space here.
     */
    keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
    values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
    if (!keys || !values) {
        ret = -ENOMEM;
        goto after_loop;
    }

again:
    preempt_disable();
    this_cpu_inc(bpf_prog_active);
    rcu_read_lock();
again_nocopy:
    dst_key = keys;
    dst_val = values;
    b = &htab->buckets[batch];
    head = &b->head;
    raw_spin_lock_irqsave(&b->lock, flags);

    bucket_cnt = 0;
    hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
        bucket_cnt++;

    if (bucket_cnt > (max_count - total)) {
        if (total == 0)
            ret = -ENOSPC;
        raw_spin_unlock_irqrestore(&b->lock, flags);
        rcu_read_unlock();
        this_cpu_dec(bpf_prog_active);
        preempt_enable();
        goto after_loop;
    }

    if (bucket_cnt > bucket_size) {
        bucket_size = bucket_cnt;
        raw_spin_unlock_irqrestore(&b->lock, flags);
        rcu_read_unlock();
        this_cpu_dec(bpf_prog_active);
        preempt_enable();
        kvfree(keys);
        kvfree(values);
        goto alloc;
    }

    hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
        memcpy(dst_key, l->key, key_size);

        if (is_percpu) {
            int off = 0, cpu;
            void __percpu *pptr;

            pptr = htab_elem_get_ptr(l, map->key_size);
            for_each_possible_cpu(cpu) {
                bpf_long_memcpy(dst_val + off,
                                per_cpu_ptr(pptr, cpu), size);
                off += size;
            }
        } else {
            value = l->key + roundup_key_size;
            if (elem_map_flags & BPF_F_LOCK)
                copy_map_value_locked(map, dst_val, value,
                                      true);
            else
                copy_map_value(map, dst_val, value);
            check_and_init_map_lock(map, dst_val);
        }
        if (do_delete) {
            hlist_nulls_del_rcu(&l->hash_node);
            if (is_lru_map)
                bpf_lru_push_free(&htab->lru, &l->lru_node);
            else
                free_htab_elem(htab, l);
        }
        dst_key += key_size;
        dst_val += value_size;
    }

    raw_spin_unlock_irqrestore(&b->lock, flags);
    /* If we are not copying data, we can go to the next bucket and avoid
     * unlocking the rcu.
     */
    if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
        batch++;
        goto again_nocopy;
    }

    rcu_read_unlock();
    this_cpu_dec(bpf_prog_active);
    preempt_enable();
    if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
                                    key_size * bucket_cnt) ||
                       copy_to_user(uvalues + total * value_size, values,
                                    value_size * bucket_cnt))) {
        ret = -EFAULT;
        goto after_loop;
    }

    total += bucket_cnt;
    batch++;
    if (batch >= htab->n_buckets) {
        ret = -ENOENT;
        goto after_loop;
    }
    goto again;

after_loop:
    if (ret == -EFAULT)
        goto out;

    /* copy # of entries and next batch */
    ubatch = u64_to_user_ptr(attr->batch.out_batch);
    if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
        put_user(total, &uattr->batch.count))
        ret = -EFAULT;

out:
    kvfree(keys);
    kvfree(values);
    return ret;
}

static int
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
                             union bpf_attr __user *uattr)
{
    return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
                                              false, true);
}

static int
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
                                        const union bpf_attr *attr,
                                        union bpf_attr __user *uattr)
{
    return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
                                              false, true);
}

static int
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
                      union bpf_attr __user *uattr)
{
    return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
                                              false, false);
}

static int
htab_map_lookup_and_delete_batch(struct bpf_map *map,
                                 const union bpf_attr *attr,
                                 union bpf_attr __user *uattr)
{
    return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
                                              false, false);
}

static int
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
                                 const union bpf_attr *attr,
                                 union bpf_attr __user *uattr)
{
    return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
                                              true, true);
}

static int
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
                                            const union bpf_attr *attr,
                                            union bpf_attr __user *uattr)
{
    return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
                                              true, true);
}

static int
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
                          union bpf_attr __user *uattr)
{
    return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
                                              true, false);
}

static int
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
                                     const union bpf_attr *attr,
                                     union bpf_attr __user *uattr)
{
    return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
                                              true, false);
}

const struct bpf_map_ops htab_map_ops = {
    .map_alloc_check = htab_map_alloc_check,
    .map_alloc = htab_map_alloc,
@@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = {
    .map_delete_elem = htab_map_delete_elem,
    .map_gen_lookup = htab_map_gen_lookup,
    .map_seq_show_elem = htab_map_seq_show_elem,
    BATCH_OPS(htab),
};

const struct bpf_map_ops htab_lru_map_ops = {
@@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
    .map_delete_elem = htab_lru_map_delete_elem,
    .map_gen_lookup = htab_lru_map_gen_lookup,
    .map_seq_show_elem = htab_map_seq_show_elem,
    BATCH_OPS(htab_lru),
};

/* Called from eBPF program */
@@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
    .map_update_elem = htab_percpu_map_update_elem,
    .map_delete_elem = htab_map_delete_elem,
    .map_seq_show_elem = htab_percpu_map_seq_show_elem,
    BATCH_OPS(htab_percpu),
};

const struct bpf_map_ops htab_lru_percpu_map_ops = {
@@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
    .map_update_elem = htab_lru_percpu_map_update_elem,
    .map_delete_elem = htab_lru_map_delete_elem,
    .map_seq_show_elem = htab_percpu_map_seq_show_elem,
    BATCH_OPS(htab_lru_percpu),
};

static int fd_htab_map_alloc_check(union bpf_attr *attr)

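From userspace, the BATCH_OPS entries above are reached through the new BPF_MAP_*_BATCH syscall commands. A sketch of the calling convention for draining a hash map, assuming the bpf_map_lookup_batch() wrapper that libbpf gained alongside this series, a map with 4-byte keys and values, and abbreviated error handling:

#include <errno.h>
#include <stdio.h>
#include <bpf/bpf.h>

#define CHUNK 64

static void dump_map(int map_fd)
{
    __u32 keys[CHUNK], vals[CHUNK];
    __u32 in_batch, out_batch;
    void *in = NULL; /* NULL in_batch: start from the first bucket */
    int err;

    do {
        __u32 count = CHUNK;

        err = bpf_map_lookup_batch(map_fd, in, &out_batch,
                                   keys, vals, &count, NULL);
        if (err && errno != ENOENT)
            break;            /* real failure */
        for (__u32 i = 0; i < count; i++)
            printf("%u -> %u\n", keys[i], vals[i]);
        in_batch = out_batch; /* resume token for the next call */
        in = &in_batch;
    } while (!err);           /* ENOENT: all buckets visited */
}
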
@@ -11,6 +11,7 @@
#include <linux/uidgid.h>
#include <linux/filter.h>
#include <linux/ctype.h>
#include <linux/jiffies.h>

#include "../../lib/kstrtox.h"

@@ -312,6 +313,17 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
    preempt_enable();
}

BPF_CALL_0(bpf_jiffies64)
{
    return get_jiffies_64();
}

const struct bpf_func_proto bpf_jiffies64_proto = {
    .func     = bpf_jiffies64,
    .gpl_only = false,
    .ret_type = RET_INTEGER,
};

#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{

|
||||
void *key = map_iter(m)->key;
|
||||
void *prev_key;
|
||||
|
||||
(*pos)++;
|
||||
if (map_iter(m)->done)
|
||||
return NULL;
|
||||
|
||||
@@ -208,8 +209,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
|
||||
map_iter(m)->done = true;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
++(*pos);
|
||||
return key;
|
||||
}
|
||||
|
||||
@@ -380,7 +379,7 @@ static const struct inode_operations bpf_dir_iops = {
|
||||
.unlink = simple_unlink,
|
||||
};
|
||||
|
||||
static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
|
||||
static int bpf_obj_do_pin(const char __user *pathname, void *raw,
|
||||
enum bpf_type type)
|
||||
{
|
||||
struct dentry *dentry;
|
||||
@@ -389,7 +388,7 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
|
||||
umode_t mode;
|
||||
int ret;
|
||||
|
||||
dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
|
||||
dentry = user_path_create(AT_FDCWD, pathname, &path, 0);
|
||||
if (IS_ERR(dentry))
|
||||
return PTR_ERR(dentry);
|
||||
|
||||
@@ -422,30 +421,22 @@ out:
|
||||
|
||||
int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
|
||||
{
|
||||
struct filename *pname;
|
||||
enum bpf_type type;
|
||||
void *raw;
|
||||
int ret;
|
||||
|
||||
pname = getname(pathname);
|
||||
if (IS_ERR(pname))
|
||||
return PTR_ERR(pname);
|
||||
|
||||
raw = bpf_fd_probe_obj(ufd, &type);
|
||||
if (IS_ERR(raw)) {
|
||||
ret = PTR_ERR(raw);
|
||||
goto out;
|
||||
}
|
||||
if (IS_ERR(raw))
|
||||
return PTR_ERR(raw);
|
||||
|
||||
ret = bpf_obj_do_pin(pname, raw, type);
|
||||
ret = bpf_obj_do_pin(pathname, raw, type);
|
||||
if (ret != 0)
|
||||
bpf_any_put(raw, type);
|
||||
out:
|
||||
putname(pname);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void *bpf_obj_do_get(const struct filename *pathname,
|
||||
static void *bpf_obj_do_get(const char __user *pathname,
|
||||
enum bpf_type *type, int flags)
|
||||
{
|
||||
struct inode *inode;
|
||||
@@ -453,7 +444,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
|
||||
void *raw;
|
||||
int ret;
|
||||
|
||||
ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
|
||||
ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
@@ -480,36 +471,27 @@ out:
|
||||
int bpf_obj_get_user(const char __user *pathname, int flags)
|
||||
{
|
||||
enum bpf_type type = BPF_TYPE_UNSPEC;
|
||||
struct filename *pname;
|
||||
int ret = -ENOENT;
|
||||
int f_flags;
|
||||
void *raw;
|
||||
int ret;
|
||||
|
||||
f_flags = bpf_get_file_flag(flags);
|
||||
if (f_flags < 0)
|
||||
return f_flags;
|
||||
|
||||
pname = getname(pathname);
|
||||
if (IS_ERR(pname))
|
||||
return PTR_ERR(pname);
|
||||
|
||||
raw = bpf_obj_do_get(pname, &type, f_flags);
|
||||
if (IS_ERR(raw)) {
|
||||
ret = PTR_ERR(raw);
|
||||
goto out;
|
||||
}
|
||||
raw = bpf_obj_do_get(pathname, &type, f_flags);
|
||||
if (IS_ERR(raw))
|
||||
return PTR_ERR(raw);
|
||||
|
||||
if (type == BPF_TYPE_PROG)
|
||||
ret = bpf_prog_new_fd(raw);
|
||||
else if (type == BPF_TYPE_MAP)
|
||||
ret = bpf_map_new_fd(raw, f_flags);
|
||||
else
|
||||
goto out;
|
||||
return -ENOENT;
|
||||
|
||||
if (ret < 0)
|
||||
bpf_any_put(raw, type);
|
||||
out:
|
||||
putname(pname);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
     */
    if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
        inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
        inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
        inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ||
        inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
        fdput(f);
        return ERR_PTR(-ENOTSUPP);
    }

@@ -23,6 +23,7 @@
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>

#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -128,6 +129,152 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
    return map;
}

static u32 bpf_map_value_size(struct bpf_map *map)
{
    if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
        map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
        map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
        map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
        return round_up(map->value_size, 8) * num_possible_cpus();
    else if (IS_FD_MAP(map))
        return sizeof(u32);
    else
        return map->value_size;
}

static void maybe_wait_bpf_programs(struct bpf_map *map)
{
    /* Wait for any running BPF programs to complete so that
     * userspace, when we return to it, knows that all programs
     * that could be running use the new map value.
     */
    if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
        map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
        synchronize_rcu();
}

static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
                                void *value, __u64 flags)
{
    int err;

    /* Need to create a kthread, thus must support schedule */
    if (bpf_map_is_dev_bound(map)) {
        return bpf_map_offload_update_elem(map, key, value, flags);
    } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
               map->map_type == BPF_MAP_TYPE_SOCKHASH ||
               map->map_type == BPF_MAP_TYPE_SOCKMAP ||
               map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
        return map->ops->map_update_elem(map, key, value, flags);
    } else if (IS_FD_PROG_ARRAY(map)) {
        return bpf_fd_array_map_update_elem(map, f.file, key, value,
                                            flags);
    }

    /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
     * inside bpf map update or delete otherwise deadlocks are possible
     */
    preempt_disable();
    __this_cpu_inc(bpf_prog_active);
    if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
        map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
        err = bpf_percpu_hash_update(map, key, value, flags);
    } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
        err = bpf_percpu_array_update(map, key, value, flags);
    } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
        err = bpf_percpu_cgroup_storage_update(map, key, value,
                                               flags);
    } else if (IS_FD_ARRAY(map)) {
        rcu_read_lock();
        err = bpf_fd_array_map_update_elem(map, f.file, key, value,
                                           flags);
        rcu_read_unlock();
    } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
        rcu_read_lock();
        err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
                                          flags);
        rcu_read_unlock();
    } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
        /* rcu_read_lock() is not needed */
        err = bpf_fd_reuseport_array_update_elem(map, key, value,
                                                 flags);
    } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
               map->map_type == BPF_MAP_TYPE_STACK) {
        err = map->ops->map_push_elem(map, value, flags);
    } else {
        rcu_read_lock();
        err = map->ops->map_update_elem(map, key, value, flags);
        rcu_read_unlock();
    }
    __this_cpu_dec(bpf_prog_active);
    preempt_enable();
    maybe_wait_bpf_programs(map);

    return err;
}

static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
                              __u64 flags)
{
    void *ptr;
    int err;

    if (bpf_map_is_dev_bound(map))
        return bpf_map_offload_lookup_elem(map, key, value);

    preempt_disable();
    this_cpu_inc(bpf_prog_active);
    if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
        map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
        err = bpf_percpu_hash_copy(map, key, value);
    } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
        err = bpf_percpu_array_copy(map, key, value);
    } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
        err = bpf_percpu_cgroup_storage_copy(map, key, value);
    } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
        err = bpf_stackmap_copy(map, key, value);
    } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
        err = bpf_fd_array_map_lookup_elem(map, key, value);
    } else if (IS_FD_HASH(map)) {
        err = bpf_fd_htab_map_lookup_elem(map, key, value);
    } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
        err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
    } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
               map->map_type == BPF_MAP_TYPE_STACK) {
        err = map->ops->map_peek_elem(map, value);
    } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
        /* struct_ops map requires directly updating "value" */
        err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
    } else {
        rcu_read_lock();
        if (map->ops->map_lookup_elem_sys_only)
            ptr = map->ops->map_lookup_elem_sys_only(map, key);
        else
            ptr = map->ops->map_lookup_elem(map, key);
        if (IS_ERR(ptr)) {
            err = PTR_ERR(ptr);
        } else if (!ptr) {
            err = -ENOENT;
        } else {
            err = 0;
            if (flags & BPF_F_LOCK)
                /* lock 'ptr' and copy everything but lock */
                copy_map_value_locked(map, value, ptr, true);
            else
                copy_map_value(map, value, ptr);
            /* mask lock, since value wasn't zero inited */
            check_and_init_map_lock(map, value);
        }
        rcu_read_unlock();
    }

    this_cpu_dec(bpf_prog_active);
    preempt_enable();
    maybe_wait_bpf_programs(map);

    return err;
}

static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
    /* We really just want to fail instead of triggering OOM killer
@@ -627,7 +774,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
    return ret;
}

#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -641,6 +788,14 @@ static int map_create(union bpf_attr *attr)
    if (err)
        return -EINVAL;

    if (attr->btf_vmlinux_value_type_id) {
        if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
            attr->btf_key_type_id || attr->btf_value_type_id)
            return -EINVAL;
    } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
        return -EINVAL;
    }

    f_flags = bpf_get_file_flag(attr->map_flags);
    if (f_flags < 0)
        return f_flags;
@@ -663,32 +818,35 @@ static int map_create(union bpf_attr *attr)
    atomic64_set(&map->usercnt, 1);
    mutex_init(&map->freeze_mutex);

    if (attr->btf_key_type_id || attr->btf_value_type_id) {
    map->spin_lock_off = -EINVAL;
    if (attr->btf_key_type_id || attr->btf_value_type_id ||
        /* Even if the map's value is a kernel struct,
         * the bpf_prog.o must have BTF to begin with
         * to figure out the corresponding kernel
         * counterpart. Thus, attr->btf_fd has
         * to be valid also.
         */
        attr->btf_vmlinux_value_type_id) {
        struct btf *btf;

        if (!attr->btf_value_type_id) {
            err = -EINVAL;
            goto free_map;
        }

        btf = btf_get_by_fd(attr->btf_fd);
        if (IS_ERR(btf)) {
            err = PTR_ERR(btf);
            goto free_map;
        }
        map->btf = btf;

        err = map_check_btf(map, btf, attr->btf_key_type_id,
                            attr->btf_value_type_id);
        if (err) {
            btf_put(btf);
            goto free_map;
        if (attr->btf_value_type_id) {
            err = map_check_btf(map, btf, attr->btf_key_type_id,
                                attr->btf_value_type_id);
            if (err)
                goto free_map;
        }

        map->btf = btf;
        map->btf_key_type_id = attr->btf_key_type_id;
        map->btf_value_type_id = attr->btf_value_type_id;
    } else {
        map->spin_lock_off = -EINVAL;
        map->btf_vmlinux_value_type_id =
            attr->btf_vmlinux_value_type_id;
    }

    err = security_bpf_map_alloc(map);
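The map_create() checks above define the syscall-level contract for BPF_MAP_TYPE_STRUCT_OPS: btf_vmlinux_value_type_id must name the kernel-side struct, btf_key_type_id/btf_value_type_id must stay zero, and btf_fd must still carry the program object's BTF. A hedged userspace sketch of that contract; the value-type id and size would really be looked up in vmlinux BTF (for a struct such as tcp_congestion_ops) rather than passed in blindly:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int create_struct_ops_map(int prog_btf_fd, __u32 vmlinux_value_type_id,
                                 __u32 value_size)
{
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.map_type = BPF_MAP_TYPE_STRUCT_OPS;
    attr.key_size = 4;            /* single __u32 key */
    attr.value_size = value_size; /* size of the kernel struct */
    attr.max_entries = 1;
    attr.btf_fd = prog_btf_fd;    /* "attr->btf_fd has to be valid also" */
    /* setting this while btf_key/value_type_id are zero selects the
     * struct_ops path in map_create() above; anything else is -EINVAL
     */
    attr.btf_vmlinux_value_type_id = vmlinux_value_type_id;

    return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
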
@@ -815,7 +973,7 @@ static int map_lookup_elem(union bpf_attr *attr)
|
||||
void __user *uvalue = u64_to_user_ptr(attr->value);
|
||||
int ufd = attr->map_fd;
|
||||
struct bpf_map *map;
|
||||
void *key, *value, *ptr;
|
||||
void *key, *value;
|
||||
u32 value_size;
|
||||
struct fd f;
|
||||
int err;
|
||||
@@ -847,72 +1005,14 @@ static int map_lookup_elem(union bpf_attr *attr)
|
||||
goto err_put;
|
||||
}
|
||||
|
||||
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
|
||||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
|
||||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
|
||||
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
|
||||
value_size = round_up(map->value_size, 8) * num_possible_cpus();
|
||||
else if (IS_FD_MAP(map))
|
||||
value_size = sizeof(u32);
|
||||
else
|
||||
value_size = map->value_size;
|
||||
value_size = bpf_map_value_size(map);
|
||||
|
||||
err = -ENOMEM;
|
||||
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
|
||||
if (!value)
|
||||
goto free_key;
|
||||
|
||||
if (bpf_map_is_dev_bound(map)) {
|
||||
err = bpf_map_offload_lookup_elem(map, key, value);
|
||||
goto done;
|
||||
}
|
||||
|
||||
preempt_disable();
|
||||
this_cpu_inc(bpf_prog_active);
|
||||
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
|
||||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
|
||||
err = bpf_percpu_hash_copy(map, key, value);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
|
||||
err = bpf_percpu_array_copy(map, key, value);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
|
||||
err = bpf_percpu_cgroup_storage_copy(map, key, value);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
|
||||
err = bpf_stackmap_copy(map, key, value);
|
||||
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
|
||||
err = bpf_fd_array_map_lookup_elem(map, key, value);
|
||||
} else if (IS_FD_HASH(map)) {
|
||||
err = bpf_fd_htab_map_lookup_elem(map, key, value);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
|
||||
err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
|
||||
map->map_type == BPF_MAP_TYPE_STACK) {
|
||||
err = map->ops->map_peek_elem(map, value);
|
||||
} else {
|
||||
rcu_read_lock();
|
||||
if (map->ops->map_lookup_elem_sys_only)
|
||||
ptr = map->ops->map_lookup_elem_sys_only(map, key);
|
||||
else
|
||||
ptr = map->ops->map_lookup_elem(map, key);
|
||||
if (IS_ERR(ptr)) {
|
||||
err = PTR_ERR(ptr);
|
||||
} else if (!ptr) {
|
||||
err = -ENOENT;
|
||||
} else {
|
||||
err = 0;
|
||||
if (attr->flags & BPF_F_LOCK)
|
||||
/* lock 'ptr' and copy everything but lock */
|
||||
copy_map_value_locked(map, value, ptr, true);
|
||||
else
|
||||
copy_map_value(map, value, ptr);
|
||||
/* mask lock, since value wasn't zero inited */
|
||||
check_and_init_map_lock(map, value);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
this_cpu_dec(bpf_prog_active);
|
||||
preempt_enable();
|
||||
|
||||
done:
|
||||
err = bpf_map_copy_value(map, key, value, attr->flags);
|
||||
if (err)
|
||||
goto free_value;
|
||||
|
||||
@@ -931,16 +1031,6 @@ err_put:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void maybe_wait_bpf_programs(struct bpf_map *map)
|
||||
{
|
||||
/* Wait for any running BPF programs to complete so that
|
||||
* userspace, when we return to it, knows that all programs
|
||||
* that could be running use the new map value.
|
||||
*/
|
||||
if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
|
||||
map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
|
||||
synchronize_rcu();
|
||||
}
|
||||
|
||||
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
|
||||
|
||||
@@ -996,60 +1086,8 @@ static int map_update_elem(union bpf_attr *attr)
|
||||
if (copy_from_user(value, uvalue, value_size) != 0)
|
||||
goto free_value;
|
||||
|
||||
/* Need to create a kthread, thus must support schedule */
|
||||
if (bpf_map_is_dev_bound(map)) {
|
||||
err = bpf_map_offload_update_elem(map, key, value, attr->flags);
|
||||
goto out;
|
||||
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
|
||||
map->map_type == BPF_MAP_TYPE_SOCKHASH ||
|
||||
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
|
||||
err = map->ops->map_update_elem(map, key, value, attr->flags);
|
||||
goto out;
|
||||
} else if (IS_FD_PROG_ARRAY(map)) {
|
||||
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
|
||||
attr->flags);
|
||||
goto out;
|
||||
}
|
||||
err = bpf_map_update_value(map, f, key, value, attr->flags);
|
||||
|
||||
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
|
||||
* inside bpf map update or delete otherwise deadlocks are possible
|
||||
*/
|
||||
preempt_disable();
|
||||
__this_cpu_inc(bpf_prog_active);
|
||||
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
|
||||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
|
||||
err = bpf_percpu_hash_update(map, key, value, attr->flags);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
|
||||
err = bpf_percpu_array_update(map, key, value, attr->flags);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
|
||||
err = bpf_percpu_cgroup_storage_update(map, key, value,
|
||||
attr->flags);
|
||||
} else if (IS_FD_ARRAY(map)) {
|
||||
rcu_read_lock();
|
||||
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
|
||||
attr->flags);
|
||||
rcu_read_unlock();
|
||||
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
|
||||
rcu_read_lock();
|
||||
err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
|
||||
attr->flags);
|
||||
rcu_read_unlock();
|
||||
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
|
||||
/* rcu_read_lock() is not needed */
|
||||
err = bpf_fd_reuseport_array_update_elem(map, key, value,
|
||||
attr->flags);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
|
||||
map->map_type == BPF_MAP_TYPE_STACK) {
|
||||
err = map->ops->map_push_elem(map, value, attr->flags);
|
||||
} else {
|
||||
rcu_read_lock();
|
||||
err = map->ops->map_update_elem(map, key, value, attr->flags);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
__this_cpu_dec(bpf_prog_active);
|
||||
preempt_enable();
|
||||
maybe_wait_bpf_programs(map);
|
||||
out:
|
||||
free_value:
|
||||
kfree(value);
|
||||
free_key:
|
||||
@@ -1091,7 +1129,9 @@ static int map_delete_elem(union bpf_attr *attr)
|
||||
if (bpf_map_is_dev_bound(map)) {
|
||||
err = bpf_map_offload_delete_elem(map, key);
|
||||
goto out;
|
||||
} else if (IS_FD_PROG_ARRAY(map)) {
|
||||
} else if (IS_FD_PROG_ARRAY(map) ||
|
||||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
|
||||
/* These maps require sleepable context */
|
||||
err = map->ops->map_delete_elem(map, key);
|
||||
goto out;
|
||||
}
|
||||
@@ -1178,6 +1218,220 @@ err_put:
|
||||
return err;
|
||||
}
|
||||
|
||||
int generic_map_delete_batch(struct bpf_map *map,
|
||||
const union bpf_attr *attr,
|
||||
union bpf_attr __user *uattr)
|
||||
{
|
||||
void __user *keys = u64_to_user_ptr(attr->batch.keys);
|
||||
u32 cp, max_count;
|
||||
int err = 0;
|
||||
void *key;
|
||||
|
||||
if (attr->batch.elem_flags & ~BPF_F_LOCK)
|
||||
return -EINVAL;
|
||||
|
||||
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
|
||||
!map_value_has_spin_lock(map)) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
max_count = attr->batch.count;
|
||||
if (!max_count)
|
||||
return 0;
|
||||
|
||||
key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
|
||||
if (!key)
|
||||
return -ENOMEM;
|
||||
|
||||
for (cp = 0; cp < max_count; cp++) {
|
||||
err = -EFAULT;
|
||||
if (copy_from_user(key, keys + cp * map->key_size,
|
||||
map->key_size))
|
||||
break;
|
||||
|
||||
if (bpf_map_is_dev_bound(map)) {
|
||||
err = bpf_map_offload_delete_elem(map, key);
|
||||
break;
|
||||
}
|
||||
|
||||
preempt_disable();
|
||||
__this_cpu_inc(bpf_prog_active);
|
||||
rcu_read_lock();
|
||||
err = map->ops->map_delete_elem(map, key);
|
||||
rcu_read_unlock();
|
||||
__this_cpu_dec(bpf_prog_active);
|
||||
preempt_enable();
|
||||
maybe_wait_bpf_programs(map);
|
||||
if (err)
|
||||
break;
|
||||
}
|
||||
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
|
||||
err = -EFAULT;
|
||||
|
||||
kfree(key);
|
||||
return err;
|
||||
}
|
||||

int generic_map_update_batch(struct bpf_map *map,
                             const union bpf_attr *attr,
                             union bpf_attr __user *uattr)
{
    void __user *values = u64_to_user_ptr(attr->batch.values);
    void __user *keys = u64_to_user_ptr(attr->batch.keys);
    u32 value_size, cp, max_count;
    int ufd = attr->map_fd;
    void *key, *value;
    struct fd f;
    int err = 0;

    f = fdget(ufd);
    if (attr->batch.elem_flags & ~BPF_F_LOCK)
        return -EINVAL;

    if ((attr->batch.elem_flags & BPF_F_LOCK) &&
        !map_value_has_spin_lock(map)) {
        return -EINVAL;
    }

    value_size = bpf_map_value_size(map);

    max_count = attr->batch.count;
    if (!max_count)
        return 0;

    key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
    if (!key)
        return -ENOMEM;

    value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
    if (!value) {
        kfree(key);
        return -ENOMEM;
    }

    for (cp = 0; cp < max_count; cp++) {
        err = -EFAULT;
        if (copy_from_user(key, keys + cp * map->key_size,
                           map->key_size) ||
            copy_from_user(value, values + cp * value_size, value_size))
            break;

        err = bpf_map_update_value(map, f, key, value,
                                   attr->batch.elem_flags);

        if (err)
            break;
    }

    if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
        err = -EFAULT;

    kfree(value);
    kfree(key);
    return err;
}

#define MAP_LOOKUP_RETRIES 3

int generic_map_lookup_batch(struct bpf_map *map,
                             const union bpf_attr *attr,
                             union bpf_attr __user *uattr)
{
    void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
    void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
    void __user *values = u64_to_user_ptr(attr->batch.values);
    void __user *keys = u64_to_user_ptr(attr->batch.keys);
    void *buf, *buf_prevkey, *prev_key, *key, *value;
    int err, retry = MAP_LOOKUP_RETRIES;
    u32 value_size, cp, max_count;

    if (attr->batch.elem_flags & ~BPF_F_LOCK)
        return -EINVAL;

    if ((attr->batch.elem_flags & BPF_F_LOCK) &&
        !map_value_has_spin_lock(map))
        return -EINVAL;

    value_size = bpf_map_value_size(map);

    max_count = attr->batch.count;
    if (!max_count)
        return 0;

    if (put_user(0, &uattr->batch.count))
        return -EFAULT;

    buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
    if (!buf_prevkey)
        return -ENOMEM;

    buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
    if (!buf) {
        kvfree(buf_prevkey);
        return -ENOMEM;
    }

    err = -EFAULT;
    prev_key = NULL;
    if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
        goto free_buf;
    key = buf;
    value = key + map->key_size;
    if (ubatch)
        prev_key = buf_prevkey;

    for (cp = 0; cp < max_count;) {
        rcu_read_lock();
        err = map->ops->map_get_next_key(map, prev_key, key);
        rcu_read_unlock();
        if (err)
            break;
        err = bpf_map_copy_value(map, key, value,
                                 attr->batch.elem_flags);

        if (err == -ENOENT) {
            if (retry) {
                retry--;
                continue;
            }
            err = -EINTR;
            break;
        }

        if (err)
            goto free_buf;

        if (copy_to_user(keys + cp * map->key_size, key,
                         map->key_size)) {
            err = -EFAULT;
            goto free_buf;
        }
        if (copy_to_user(values + cp * value_size, value, value_size)) {
            err = -EFAULT;
            goto free_buf;
        }

        if (!prev_key)
            prev_key = buf_prevkey;

        swap(prev_key, key);
        retry = MAP_LOOKUP_RETRIES;
        cp++;
    }

    if (err == -EFAULT)
        goto free_buf;

    if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
        (cp && copy_to_user(uobatch, prev_key, map->key_size))))
        err = -EFAULT;

free_buf:
    kfree(buf_prevkey);
    kfree(buf);
    return err;
}
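generic_map_lookup_batch() returns the last visited key through out_batch, and a caller resumes by feeding that key back via in_batch; a NULL in_batch starts from the beginning and -ENOENT marks the end of the map. A hedged userspace sketch of that cursor loop (buffer sizes, chunk size, and names are assumptions):

/* Hedged sketch: walk an entire map in chunks over the batch cursor.
 * Assumes key_size <= 64 and value_size <= 256.
 */
#include <errno.h>
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int dump_map(int map_fd)
{
    char out_key[64], keys[32 * 64], values[32 * 256];
    void *in_batch = NULL; /* NULL: start from the first key */

    for (;;) {
        union bpf_attr attr;
        int err;

        memset(&attr, 0, sizeof(attr));
        attr.batch.map_fd = map_fd;
        attr.batch.in_batch = (__u64)(unsigned long)in_batch;
        attr.batch.out_batch = (__u64)(unsigned long)out_key;
        attr.batch.keys = (__u64)(unsigned long)keys;
        attr.batch.values = (__u64)(unsigned long)values;
        attr.batch.count = 32;

        err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));
        /* attr.batch.count entries of keys[]/values[] are valid here */
        if (err && errno == ENOENT)
            return 0; /* walked past the last element */
        if (err)
            return -errno;

        in_batch = out_key; /* resume after the last key returned */
    }
}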

#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value

static int map_lookup_and_delete_elem(union bpf_attr *attr)
@@ -1306,6 +1560,36 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
    return 0;
}

enum bpf_audit {
    BPF_AUDIT_LOAD,
    BPF_AUDIT_UNLOAD,
    BPF_AUDIT_MAX,
};

static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
    [BPF_AUDIT_LOAD]   = "LOAD",
    [BPF_AUDIT_UNLOAD] = "UNLOAD",
};

static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
{
    struct audit_context *ctx = NULL;
    struct audit_buffer *ab;

    if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
        return;
    if (audit_enabled == AUDIT_OFF)
        return;
    if (op == BPF_AUDIT_LOAD)
        ctx = audit_context();
    ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
    if (unlikely(!ab))
        return;
    audit_log_format(ab, "prog-id=%u op=%s",
                     prog->aux->id, bpf_audit_str[op]);
    audit_log_end(ab);
}
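Given the format string above, a successful load of a program with id 42 (an illustrative id) would show up in the audit log as an AUDIT_BPF record whose payload reads:

    prog-id=42 op=LOAD

The matching UNLOAD record is emitted from __bpf_prog_put() below when the last reference is dropped.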

int __bpf_prog_charge(struct user_struct *user, u32 pages)
{
    unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -1421,6 +1705,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
    if (atomic64_dec_and_test(&prog->aux->refcnt)) {
        perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
        bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
        /* bpf_prog_free_id() must be called first */
        bpf_prog_free_id(prog, do_idr_lock);
        __bpf_prog_put_noref(prog, true);
@@ -1640,17 +1925,24 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
                           enum bpf_attach_type expected_attach_type,
                           u32 btf_id, u32 prog_fd)
{
    switch (prog_type) {
    case BPF_PROG_TYPE_TRACING:
    if (btf_id) {
        if (btf_id > BTF_MAX_TYPE)
            return -EINVAL;
        break;
    default:
        if (btf_id || prog_fd)

        switch (prog_type) {
        case BPF_PROG_TYPE_TRACING:
        case BPF_PROG_TYPE_STRUCT_OPS:
        case BPF_PROG_TYPE_EXT:
            break;
        default:
            return -EINVAL;
        break;
        }
    }

    if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
        prog_type != BPF_PROG_TYPE_EXT)
        return -EINVAL;

    switch (prog_type) {
    case BPF_PROG_TYPE_CGROUP_SOCK:
        switch (expected_attach_type) {
@@ -1691,6 +1983,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
        default:
            return -EINVAL;
        }
    case BPF_PROG_TYPE_EXT:
        if (expected_attach_type)
            return -EINVAL;
        /* fallthrough */
    default:
        return 0;
    }
@@ -1830,6 +2126,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
     */
    bpf_prog_kallsyms_add(prog);
    perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
    bpf_audit_prog(prog, BPF_AUDIT_LOAD);

    err = bpf_prog_new_fd(prog);
    if (err < 0)
@@ -1892,7 +2189,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
    int tr_fd, err;

    if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
        prog->expected_attach_type != BPF_TRACE_FEXIT) {
        prog->expected_attach_type != BPF_TRACE_FEXIT &&
        prog->type != BPF_PROG_TYPE_EXT) {
        err = -EINVAL;
        goto out_put_prog;
    }
@@ -1959,12 +2257,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)

    if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
        prog->type != BPF_PROG_TYPE_TRACING &&
        prog->type != BPF_PROG_TYPE_EXT &&
        prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
        err = -EINVAL;
        goto out_put_prog;
    }

    if (prog->type == BPF_PROG_TYPE_TRACING) {
    if (prog->type == BPF_PROG_TYPE_TRACING ||
        prog->type == BPF_PROG_TYPE_EXT) {
        if (attr->raw_tracepoint.name) {
            /* The attach point for this category of programs
             * should be specified via btf_id during program load.
@@ -2040,10 +2340,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
    }
}

#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd

#define BPF_F_ATTACH_MASK \
    (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
    (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)

static int bpf_prog_attach(const union bpf_attr *attr)
{
@@ -2305,6 +2605,23 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,

#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id

struct bpf_prog *bpf_prog_by_id(u32 id)
{
    struct bpf_prog *prog;

    if (!id)
        return ERR_PTR(-ENOENT);

    spin_lock_bh(&prog_idr_lock);
    prog = idr_find(&prog_idr, id);
    if (prog)
        prog = bpf_prog_inc_not_zero(prog);
    else
        prog = ERR_PTR(-ENOENT);
    spin_unlock_bh(&prog_idr_lock);
    return prog;
}
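bpf_prog_by_id() hands back a reference-counted program (or ERR_PTR(-ENOENT)), so any in-kernel caller must drop the reference when done. A hedged sketch of the expected calling pattern; the surrounding function is made up:

/* Hedged in-kernel usage sketch. */
static int show_prog_name(u32 id)
{
    struct bpf_prog *prog = bpf_prog_by_id(id);

    if (IS_ERR(prog))
        return PTR_ERR(prog);

    pr_info("prog %u: %s\n", id, prog->aux->name);
    bpf_prog_put(prog); /* drop the reference taken by bpf_prog_by_id() */
    return 0;
}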

static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
{
    struct bpf_prog *prog;
@@ -2317,14 +2634,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
    if (!capable(CAP_SYS_ADMIN))
        return -EPERM;

    spin_lock_bh(&prog_idr_lock);
    prog = idr_find(&prog_idr, id);
    if (prog)
        prog = bpf_prog_inc_not_zero(prog);
    else
        prog = ERR_PTR(-ENOENT);
    spin_unlock_bh(&prog_idr_lock);

    prog = bpf_prog_by_id(id);
    if (IS_ERR(prog))
        return PTR_ERR(prog);

@@ -2774,6 +3084,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
        info.btf_key_type_id = map->btf_key_type_id;
        info.btf_value_type_id = map->btf_value_type_id;
    }
    info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;

    if (bpf_map_is_dev_bound(map)) {
        err = bpf_map_offload_info_fill(&info, map);
@@ -2986,6 +3297,61 @@ out:
    return err;
}

#define BPF_MAP_BATCH_LAST_FIELD batch.flags

#define BPF_DO_BATCH(fn)                    \
    do {                                    \
        if (!fn) {                          \
            err = -ENOTSUPP;                \
            goto err_put;                   \
        }                                   \
        err = fn(map, attr, uattr);         \
    } while (0)

static int bpf_map_do_batch(const union bpf_attr *attr,
                            union bpf_attr __user *uattr,
                            int cmd)
{
    struct bpf_map *map;
    int err, ufd;
    struct fd f;

    if (CHECK_ATTR(BPF_MAP_BATCH))
        return -EINVAL;

    ufd = attr->batch.map_fd;
    f = fdget(ufd);
    map = __bpf_map_get(f);
    if (IS_ERR(map))
        return PTR_ERR(map);

    if ((cmd == BPF_MAP_LOOKUP_BATCH ||
         cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
        !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
        err = -EPERM;
        goto err_put;
    }

    if (cmd != BPF_MAP_LOOKUP_BATCH &&
        !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
        err = -EPERM;
        goto err_put;
    }

    if (cmd == BPF_MAP_LOOKUP_BATCH)
        BPF_DO_BATCH(map->ops->map_lookup_batch);
    else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
        BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
    else if (cmd == BPF_MAP_UPDATE_BATCH)
        BPF_DO_BATCH(map->ops->map_update_batch);
    else
        BPF_DO_BATCH(map->ops->map_delete_batch);

err_put:
    fdput(f);
    return err;
}

SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
    union bpf_attr attr = {};
@@ -3083,6 +3449,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
    case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
        err = map_lookup_and_delete_elem(&attr);
        break;
    case BPF_MAP_LOOKUP_BATCH:
        err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
        break;
    case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
        err = bpf_map_do_batch(&attr, uattr,
                               BPF_MAP_LOOKUP_AND_DELETE_BATCH);
        break;
    case BPF_MAP_UPDATE_BATCH:
        err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
        break;
    case BPF_MAP_DELETE_BATCH:
        err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
        break;
    default:
        err = -EINVAL;
        break;

kernel/bpf/trampoline.c
@@ -4,16 +4,98 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
#include <linux/rbtree_latch.h>

/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
};
const struct bpf_prog_ops bpf_extension_prog_ops = {
};

/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)

static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
static struct latch_tree_root image_tree __cacheline_aligned;

/* serializes access to trampoline_table */
/* serializes access to trampoline_table and image_tree */
static DEFINE_MUTEX(trampoline_mutex);

static void *bpf_jit_alloc_exec_page(void)
{
    void *image;

    image = bpf_jit_alloc_exec(PAGE_SIZE);
    if (!image)
        return NULL;

    set_vm_flush_reset_perms(image);
    /* Keep image as writeable. The alternative is to keep flipping ro/rw
     * every time a new program is attached or detached.
     */
    set_memory_x((long)image, 1);
    return image;
}

static __always_inline bool image_tree_less(struct latch_tree_node *a,
                                            struct latch_tree_node *b)
{
    struct bpf_image *ia = container_of(a, struct bpf_image, tnode);
    struct bpf_image *ib = container_of(b, struct bpf_image, tnode);

    return ia < ib;
}

static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n)
{
    void *image = container_of(n, struct bpf_image, tnode);

    if (addr < image)
        return -1;
    if (addr >= image + PAGE_SIZE)
        return 1;

    return 0;
}

static const struct latch_tree_ops image_tree_ops = {
    .less = image_tree_less,
    .comp = image_tree_comp,
};

static void *__bpf_image_alloc(bool lock)
{
    struct bpf_image *image;

    image = bpf_jit_alloc_exec_page();
    if (!image)
        return NULL;

    if (lock)
        mutex_lock(&trampoline_mutex);
    latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops);
    if (lock)
        mutex_unlock(&trampoline_mutex);
    return image->data;
}

void *bpf_image_alloc(void)
{
    return __bpf_image_alloc(true);
}

bool is_bpf_image_address(unsigned long addr)
{
    bool ret;

    rcu_read_lock();
    ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL;
    rcu_read_unlock();

    return ret;
}

struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
    struct bpf_trampoline *tr;
@@ -34,7 +116,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
        goto out;

    /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
    image = bpf_jit_alloc_exec(PAGE_SIZE);
    image = __bpf_image_alloc(false);
    if (!image) {
        kfree(tr);
        tr = NULL;
@@ -48,12 +130,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
    mutex_init(&tr->mutex);
    for (i = 0; i < BPF_TRAMP_MAX; i++)
        INIT_HLIST_HEAD(&tr->progs_hlist[i]);

    set_vm_flush_reset_perms(image);
    /* Keep image as writeable. The alternative is to keep flipping ro/rw
     * every time a new program is attached or detached.
     */
    set_memory_x((long)image, 1);
    tr->image = image;
out:
    mutex_unlock(&trampoline_mutex);
@@ -115,14 +191,14 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}

/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
 * bytes on x86. Pick a number to fit into PAGE_SIZE / 2
 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2
 */
#define BPF_MAX_TRAMP_PROGS 40

static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
    void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
    void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
    void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2;
    void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2;
    struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
    int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
    int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
@@ -150,11 +226,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
    if (fexit_cnt)
        flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;

    err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags,
    /* Though the second half of trampoline page is unused a task could be
     * preempted in the middle of the first half of trampoline and two
     * updates to trampoline would change the code from underneath the
     * preempted task. Hence wait for tasks to voluntarily schedule or go
     * to userspace.
     */
    synchronize_rcu_tasks();

    err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2,
                                      &tr->func.model, flags,
                                      fentry, fentry_cnt,
                                      fexit, fexit_cnt,
                                      tr->func.addr);
    if (err)
    if (err < 0)
        goto out;

    if (tr->selector)
@@ -175,8 +260,10 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t)
    switch (t) {
    case BPF_TRACE_FENTRY:
        return BPF_TRAMP_FENTRY;
    default:
    case BPF_TRACE_FEXIT:
        return BPF_TRAMP_FEXIT;
    default:
        return BPF_TRAMP_REPLACE;
    }
}

@@ -185,12 +272,31 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog)
    enum bpf_tramp_prog_type kind;
    struct bpf_trampoline *tr;
    int err = 0;
    int cnt;

    tr = prog->aux->trampoline;
    kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
    mutex_lock(&tr->mutex);
    if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]
        >= BPF_MAX_TRAMP_PROGS) {
    if (tr->extension_prog) {
        /* cannot attach fentry/fexit if extension prog is attached.
         * cannot overwrite extension prog either.
         */
        err = -EBUSY;
        goto out;
    }
    cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
    if (kind == BPF_TRAMP_REPLACE) {
        /* Cannot attach extension if fentry/fexit are in use. */
        if (cnt) {
            err = -EBUSY;
            goto out;
        }
        tr->extension_prog = prog;
        err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
                                 prog->bpf_func);
        goto out;
    }
    if (cnt >= BPF_MAX_TRAMP_PROGS) {
        err = -E2BIG;
        goto out;
    }
@@ -221,15 +327,25 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
    tr = prog->aux->trampoline;
    kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
    mutex_lock(&tr->mutex);
    if (kind == BPF_TRAMP_REPLACE) {
        WARN_ON_ONCE(!tr->extension_prog);
        err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
                                 tr->extension_prog->bpf_func, NULL);
        tr->extension_prog = NULL;
        goto out;
    }
    hlist_del(&prog->aux->tramp_hlist);
    tr->progs_cnt[kind]--;
    err = bpf_trampoline_update(prog->aux->trampoline);
out:
    mutex_unlock(&tr->mutex);
    return err;
}

void bpf_trampoline_put(struct bpf_trampoline *tr)
{
    struct bpf_image *image;

    if (!tr)
        return;
    mutex_lock(&trampoline_mutex);
@@ -240,7 +356,11 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
        goto out;
    if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
        goto out;
    bpf_jit_free_exec(tr->image);
    image = container_of(tr->image, struct bpf_image, data);
    latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops);
    /* wait for tasks to get out of trampoline before freeing it */
    synchronize_rcu_tasks();
    bpf_jit_free_exec(image);
    hlist_del(&tr->hlist);
    kfree(tr);
out:
@@ -286,7 +406,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
}

int __weak
arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
arch_prepare_bpf_trampoline(void *image, void *image_end,
                            const struct btf_func_model *m, u32 flags,
                            struct bpf_prog **fentry_progs, int fentry_cnt,
                            struct bpf_prog **fexit_progs, int fexit_cnt,
                            void *orig_call)
kernel/bpf/verifier.c
@@ -1122,10 +1122,6 @@ static void init_reg_state(struct bpf_verifier_env *env,
    regs[BPF_REG_FP].type = PTR_TO_STACK;
    mark_reg_known_zero(env, regs, BPF_REG_FP);
    regs[BPF_REG_FP].frameno = state->frameno;

    /* 1st arg to a function */
    regs[BPF_REG_1].type = PTR_TO_CTX;
    mark_reg_known_zero(env, regs, BPF_REG_1);
}

#define BPF_MAIN_FUNC (-1)
@@ -1916,6 +1912,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
    case PTR_TO_TCP_SOCK:
    case PTR_TO_TCP_SOCK_OR_NULL:
    case PTR_TO_XDP_SOCK:
    case PTR_TO_BTF_ID:
        return true;
    default:
        return false;
@@ -2738,8 +2735,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
}
#endif

static int check_ctx_reg(struct bpf_verifier_env *env,
                         const struct bpf_reg_state *reg, int regno)
int check_ctx_reg(struct bpf_verifier_env *env,
                  const struct bpf_reg_state *reg, int regno)
{
    /* Access to ctx or passing it to a helper is only allowed in
     * its original, unmodified form.
@@ -2858,11 +2855,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
    u32 btf_id;
    int ret;

    if (atype != BPF_READ) {
        verbose(env, "only read is supported\n");
        return -EACCES;
    }

    if (off < 0) {
        verbose(env,
                "R%d is ptr_%s invalid negative access: off=%d\n",
@@ -2879,17 +2871,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
        return -EACCES;
    }

    ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
    if (env->ops->btf_struct_access) {
        ret = env->ops->btf_struct_access(&env->log, t, off, size,
                                          atype, &btf_id);
    } else {
        if (atype != BPF_READ) {
            verbose(env, "only read is supported\n");
            return -EACCES;
        }

        ret = btf_struct_access(&env->log, t, off, size, atype,
                                &btf_id);
    }

    if (ret < 0)
        return ret;

    if (ret == SCALAR_VALUE) {
        mark_reg_unknown(env, regs, value_regno);
        return 0;
    if (atype == BPF_READ) {
        if (ret == SCALAR_VALUE) {
            mark_reg_unknown(env, regs, value_regno);
            return 0;
        }
        mark_reg_known_zero(env, regs, value_regno);
        regs[value_regno].type = PTR_TO_BTF_ID;
        regs[value_regno].btf_id = btf_id;
    }
    mark_reg_known_zero(env, regs, value_regno);
    regs[value_regno].type = PTR_TO_BTF_ID;
    regs[value_regno].btf_id = btf_id;

    return 0;
}

@@ -3945,12 +3952,26 @@ static int release_reference(struct bpf_verifier_env *env,
    return 0;
}

static void clear_caller_saved_regs(struct bpf_verifier_env *env,
                                    struct bpf_reg_state *regs)
{
    int i;

    /* after the call registers r0 - r5 were scratched */
    for (i = 0; i < CALLER_SAVED_REGS; i++) {
        mark_reg_not_init(env, regs, caller_saved[i]);
        check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
    }
}

static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                           int *insn_idx)
{
    struct bpf_verifier_state *state = env->cur_state;
    struct bpf_func_info_aux *func_info_aux;
    struct bpf_func_state *caller, *callee;
    int i, err, subprog, target_insn;
    bool is_global = false;

    if (state->curframe + 1 >= MAX_CALL_FRAMES) {
        verbose(env, "the call stack of %d frames is too deep\n",
@@ -3973,6 +3994,32 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
        return -EFAULT;
    }

    func_info_aux = env->prog->aux->func_info_aux;
    if (func_info_aux)
        is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
    err = btf_check_func_arg_match(env, subprog, caller->regs);
    if (err == -EFAULT)
        return err;
    if (is_global) {
        if (err) {
            verbose(env, "Caller passes invalid args into func#%d\n",
                    subprog);
            return err;
        } else {
            if (env->log.level & BPF_LOG_LEVEL)
                verbose(env,
                        "Func#%d is global and valid. Skipping.\n",
                        subprog);
            clear_caller_saved_regs(env, caller->regs);

            /* All global functions return SCALAR_VALUE */
            mark_reg_unknown(env, caller->regs, BPF_REG_0);

            /* continue with next insn after call */
            return 0;
        }
    }

    callee = kzalloc(sizeof(*callee), GFP_KERNEL);
    if (!callee)
        return -ENOMEM;
@@ -3999,18 +4046,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
    for (i = BPF_REG_1; i <= BPF_REG_5; i++)
        callee->regs[i] = caller->regs[i];

    /* after the call registers r0 - r5 were scratched */
    for (i = 0; i < CALLER_SAVED_REGS; i++) {
        mark_reg_not_init(env, caller->regs, caller_saved[i]);
        check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
    }
    clear_caller_saved_regs(env, caller->regs);

    /* only increment it after check_reg_arg() finished */
    state->curframe++;

    if (btf_check_func_arg_match(env, subprog))
        return -EINVAL;

    /* and go analyze first insn of the callee */
    *insn_idx = target_insn;

@@ -6360,8 +6400,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
static int check_return_code(struct bpf_verifier_env *env)
{
    struct tnum enforce_attach_type_range = tnum_unknown;
    const struct bpf_prog *prog = env->prog;
    struct bpf_reg_state *reg;
    struct tnum range = tnum_range(0, 1);
    int err;

    /* The struct_ops func-ptr's return type could be "void" */
    if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS &&
        !prog->aux->attach_func_proto->type)
        return 0;

    /* eBPF calling convention is such that R0 is used
     * to return the value from eBPF program.
     * Make sure that it's readable at this time
     * of bpf_exit, which means that program wrote
     * something into it earlier
     */
    err = check_reg_arg(env, BPF_REG_0, SRC_OP);
    if (err)
        return err;

    if (is_pointer_value(env, BPF_REG_0)) {
        verbose(env, "R0 leaks addr as return value\n");
        return -EACCES;
    }

    switch (env->prog->type) {
    case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -6750,12 +6812,13 @@ static int check_btf_func(struct bpf_verifier_env *env,

        /* check type_id */
        type = btf_type_by_id(btf, krecord[i].type_id);
        if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
        if (!type || !btf_type_is_func(type)) {
            verbose(env, "invalid type id %d in func info",
                    krecord[i].type_id);
            ret = -EINVAL;
            goto err_free;
        }
        info_aux[i].linkage = BTF_INFO_VLEN(type->info);
        prev_offset = krecord[i].insn_off;
        urecord += urec_size;
    }
@@ -7735,35 +7798,13 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)

static int do_check(struct bpf_verifier_env *env)
{
    struct bpf_verifier_state *state;
    struct bpf_verifier_state *state = env->cur_state;
    struct bpf_insn *insns = env->prog->insnsi;
    struct bpf_reg_state *regs;
    int insn_cnt = env->prog->len;
    bool do_print_state = false;
    int prev_insn_idx = -1;

    env->prev_linfo = NULL;

    state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
    if (!state)
        return -ENOMEM;
    state->curframe = 0;
    state->speculative = false;
    state->branches = 1;
    state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
    if (!state->frame[0]) {
        kfree(state);
        return -ENOMEM;
    }
    env->cur_state = state;
    init_func_state(env, state->frame[0],
                    BPF_MAIN_FUNC /* callsite */,
                    0 /* frameno */,
                    0 /* subprogno, zero == main subprog */);

    if (btf_check_func_arg_match(env, 0))
        return -EINVAL;

    for (;;) {
        struct bpf_insn *insn;
        u8 class;
@@ -7841,7 +7882,7 @@ static int do_check(struct bpf_verifier_env *env)
        }

        regs = cur_regs(env);
        env->insn_aux_data[env->insn_idx].seen = true;
        env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
        prev_insn_idx = env->insn_idx;

        if (class == BPF_ALU || class == BPF_ALU64) {
@@ -8027,21 +8068,6 @@ static int do_check(struct bpf_verifier_env *env)
                if (err)
                    return err;

                /* eBPF calling convention is such that R0 is used
                 * to return the value from eBPF program.
                 * Make sure that it's readable at this time
                 * of bpf_exit, which means that program wrote
                 * something into it earlier
                 */
                err = check_reg_arg(env, BPF_REG_0, SRC_OP);
                if (err)
                    return err;

                if (is_pointer_value(env, BPF_REG_0)) {
                    verbose(env, "R0 leaks addr as return value\n");
                    return -EACCES;
                }

                err = check_return_code(env);
                if (err)
                    return err;
@@ -8076,7 +8102,7 @@ process_bpf_exit:
                    return err;

                env->insn_idx++;
                env->insn_aux_data[env->insn_idx].seen = true;
                env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
            } else {
                verbose(env, "invalid BPF_LD mode\n");
                return -EINVAL;
@@ -8089,7 +8115,6 @@ process_bpf_exit:
        env->insn_idx++;
    }

    env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
    return 0;
}

@@ -8149,6 +8174,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
        return -EINVAL;
    }

    if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
        verbose(env, "bpf_struct_ops map cannot be used in prog\n");
        return -EINVAL;
    }

    return 0;
}

@@ -8361,7 +8391,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
    memcpy(new_data + off + cnt - 1, old_data + off,
           sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
    for (i = off; i < off + cnt - 1; i++) {
        new_data[i].seen = true;
        new_data[i].seen = env->pass_cnt;
        new_data[i].zext_dst = insn_has_def32(env, insn + i);
    }
    env->insn_aux_data = new_data;
@@ -8840,12 +8870,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
            convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
            break;
        case PTR_TO_BTF_ID:
            if (type == BPF_WRITE) {
            if (type == BPF_READ) {
                insn->code = BPF_LDX | BPF_PROBE_MEM |
                    BPF_SIZE((insn)->code);
                env->prog->aux->num_exentries++;
            } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) {
                verbose(env, "Writes through BTF pointers are not allowed\n");
                return -EINVAL;
            }
            insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code);
            env->prog->aux->num_exentries++;
            continue;
        default:
            continue;
@@ -9425,6 +9457,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
            goto patch_call_imm;
        }

        if (prog->jit_requested && BITS_PER_LONG == 64 &&
            insn->imm == BPF_FUNC_jiffies64) {
            struct bpf_insn ld_jiffies_addr[2] = {
                BPF_LD_IMM64(BPF_REG_0,
                             (unsigned long)&jiffies),
            };

            insn_buf[0] = ld_jiffies_addr[0];
            insn_buf[1] = ld_jiffies_addr[1];
            insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
                                      BPF_REG_0, 0);
            cnt = 3;

            new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
                                           cnt);
            if (!new_prog)
                return -ENOMEM;

            delta    += cnt - 1;
            env->prog = prog = new_prog;
            insn      = new_prog->insnsi + i + delta;
            continue;
        }
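The hunk above inlines the new bpf_jiffies64() helper into two loads when the program is JITed on a 64-bit host. From BPF program C code the helper is simply called; a hedged sketch, with the section name and body purely illustrative:

/* Hedged sketch of a BPF C program using the helper. When JITed on
 * 64-bit, the call below is rewritten into a direct load of jiffies.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tracepoint/syscalls/sys_enter_getpid")
int sample_jiffies(void *ctx)
{
    __u64 now = bpf_jiffies64();

    bpf_printk("jiffies64=%llu", now);
    return 0;
}

char _license[] SEC("license") = "GPL";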

patch_call_imm:
        fn = env->ops->get_func_proto(insn->imm, env->prog);
        /* all functions that have prototype and verifier allowed
@@ -9471,6 +9527,7 @@ static void free_states(struct bpf_verifier_env *env)
        kfree(sl);
        sl = sln;
    }
    env->free_list = NULL;

    if (!env->explored_states)
        return;
@@ -9484,11 +9541,164 @@ static void free_states(struct bpf_verifier_env *env)
            kfree(sl);
            sl = sln;
        }
        env->explored_states[i] = NULL;
    }
}

/* The verifier is using insn_aux_data[] to store temporary data during
 * verification and to store information for passes that run after the
 * verification like dead code sanitization. do_check_common() for subprogram N
 * may analyze many other subprograms. sanitize_insn_aux_data() clears all
 * temporary data after do_check_common() finds that subprogram N cannot be
 * verified independently. pass_cnt counts the number of times
 * do_check_common() was run and insn->aux->seen tells the pass number
 * insn_aux_data was touched. These variables are compared to clear temporary
 * data from failed pass. For testing and experiments do_check_common() can be
 * run multiple times even when prior attempt to verify is unsuccessful.
 */
static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
{
    struct bpf_insn *insn = env->prog->insnsi;
    struct bpf_insn_aux_data *aux;
    int i, class;

    for (i = 0; i < env->prog->len; i++) {
        class = BPF_CLASS(insn[i].code);
        if (class != BPF_LDX && class != BPF_STX)
            continue;
        aux = &env->insn_aux_data[i];
        if (aux->seen != env->pass_cnt)
            continue;
        memset(aux, 0, offsetof(typeof(*aux), orig_idx));
    }
}

static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
    struct bpf_verifier_state *state;
    struct bpf_reg_state *regs;
    int ret, i;

    env->prev_linfo = NULL;
    env->pass_cnt++;

    state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
    if (!state)
        return -ENOMEM;
    state->curframe = 0;
    state->speculative = false;
    state->branches = 1;
    state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
    if (!state->frame[0]) {
        kfree(state);
        return -ENOMEM;
    }
    env->cur_state = state;
    init_func_state(env, state->frame[0],
                    BPF_MAIN_FUNC /* callsite */,
                    0 /* frameno */,
                    subprog);

    regs = state->frame[state->curframe]->regs;
    if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
        ret = btf_prepare_func_args(env, subprog, regs);
        if (ret)
            goto out;
        for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
            if (regs[i].type == PTR_TO_CTX)
                mark_reg_known_zero(env, regs, i);
            else if (regs[i].type == SCALAR_VALUE)
                mark_reg_unknown(env, regs, i);
        }
    } else {
        /* 1st arg to a function */
        regs[BPF_REG_1].type = PTR_TO_CTX;
        mark_reg_known_zero(env, regs, BPF_REG_1);
        ret = btf_check_func_arg_match(env, subprog, regs);
        if (ret == -EFAULT)
            /* unlikely verifier bug. abort.
             * ret == 0 and ret < 0 are sadly acceptable for
             * main() function due to backward compatibility.
             * Like socket filter program may be written as:
             * int bpf_prog(struct pt_regs *ctx)
             * and never dereference that ctx in the program.
             * 'struct pt_regs' is a type mismatch for socket
             * filter that should be using 'struct __sk_buff'.
             */
            goto out;
    }

    kvfree(env->explored_states);
    ret = do_check(env);
out:
    /* check for NULL is necessary, since cur_state can be freed inside
     * do_check() under memory pressure.
     */
    if (env->cur_state) {
        free_verifier_state(env->cur_state, true);
        env->cur_state = NULL;
    }
    while (!pop_stack(env, NULL, NULL));
    free_states(env);
    if (ret)
        /* clean aux data in case subprog was rejected */
        sanitize_insn_aux_data(env);
    return ret;
}

/* Verify all global functions in a BPF program one by one based on their BTF.
 * All global functions must pass verification. Otherwise the whole program is rejected.
 * Consider:
 * int bar(int);
 * int foo(int f)
 * {
 *    return bar(f);
 * }
 * int bar(int b)
 * {
 *    ...
 * }
 * foo() will be verified first for R1=any_scalar_value. During verification it
 * will be assumed that bar() already verified successfully and call to bar()
 * from foo() will be checked for type match only. Later bar() will be verified
 * independently to check that it's safe for R1=any_scalar_value.
 */
static int do_check_subprogs(struct bpf_verifier_env *env)
{
    struct bpf_prog_aux *aux = env->prog->aux;
    int i, ret;

    if (!aux->func_info)
        return 0;

    for (i = 1; i < env->subprog_cnt; i++) {
        if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
            continue;
        env->insn_idx = env->subprog_info[i].start;
        WARN_ON_ONCE(env->insn_idx == 0);
        ret = do_check_common(env, i);
        if (ret) {
            return ret;
        } else if (env->log.level & BPF_LOG_LEVEL) {
            verbose(env,
                    "Func#%d is safe for any args that match its prototype\n",
                    i);
        }
    }
    return 0;
}
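As a concrete illustration of the comment above, a hedged sketch of a BPF C program that exercises function-by-function verification; the section name and logic are made up, and it assumes a clang/BTF toolchain that emits BTF_FUNC_GLOBAL linkage for non-static functions:

/* Hedged sketch: bar() is non-static, so it is verified once for any
 * scalar argument; the call site in foo() is only checked against
 * bar()'s BTF prototype.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define __noinline __attribute__((noinline))

__noinline int bar(int b)
{
    return b * 2; /* checked independently for R1 = any scalar */
}

SEC("tracepoint/syscalls/sys_enter_nanosleep")
int foo(void *ctx)
{
    return bar(21) == 42 ? 0 : 1;
}

char _license[] SEC("license") = "GPL";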

static int do_check_main(struct bpf_verifier_env *env)
{
    int ret;

    env->insn_idx = 0;
    ret = do_check_common(env, 0);
    if (!ret)
        env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
    return ret;
}


static void print_verification_stats(struct bpf_verifier_env *env)
{
    int i;
@@ -9513,9 +9723,62 @@ static void print_verification_stats(struct bpf_verifier_env *env)
        env->peak_states, env->longest_mark_read_walk);
}

static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
{
    const struct btf_type *t, *func_proto;
    const struct bpf_struct_ops *st_ops;
    const struct btf_member *member;
    struct bpf_prog *prog = env->prog;
    u32 btf_id, member_idx;
    const char *mname;

    btf_id = prog->aux->attach_btf_id;
    st_ops = bpf_struct_ops_find(btf_id);
    if (!st_ops) {
        verbose(env, "attach_btf_id %u is not a supported struct\n",
                btf_id);
        return -ENOTSUPP;
    }

    t = st_ops->type;
    member_idx = prog->expected_attach_type;
    if (member_idx >= btf_type_vlen(t)) {
        verbose(env, "attach to invalid member idx %u of struct %s\n",
                member_idx, st_ops->name);
        return -EINVAL;
    }

    member = &btf_type_member(t)[member_idx];
    mname = btf_name_by_offset(btf_vmlinux, member->name_off);
    func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
                                           NULL);
    if (!func_proto) {
        verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
                mname, member_idx, st_ops->name);
        return -EINVAL;
    }

    if (st_ops->check_member) {
        int err = st_ops->check_member(t, member);

        if (err) {
            verbose(env, "attach to unsupported member %s of struct %s\n",
                    mname, st_ops->name);
            return err;
        }
    }

    prog->aux->attach_func_proto = func_proto;
    prog->aux->attach_func_name = mname;
    env->ops = st_ops->verifier_ops;

    return 0;
}

static int check_attach_btf_id(struct bpf_verifier_env *env)
{
    struct bpf_prog *prog = env->prog;
    bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
    struct bpf_prog *tgt_prog = prog->aux->linked_prog;
    u32 btf_id = prog->aux->attach_btf_id;
    const char prefix[] = "btf_trace_";
@@ -9528,7 +9791,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
    long addr;
    u64 key;

    if (prog->type != BPF_PROG_TYPE_TRACING)
    if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
        return check_struct_ops_btf_id(env);

    if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension)
        return 0;

    if (!btf_id) {
@@ -9564,8 +9830,59 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
            return -EINVAL;
        }
        conservative = aux->func_info_aux[subprog].unreliable;
        if (prog_extension) {
            if (conservative) {
                verbose(env,
                        "Cannot replace static functions\n");
                return -EINVAL;
            }
            if (!prog->jit_requested) {
                verbose(env,
                        "Extension programs should be JITed\n");
                return -EINVAL;
            }
            env->ops = bpf_verifier_ops[tgt_prog->type];
        }
        if (!tgt_prog->jited) {
            verbose(env, "Can attach to only JITed progs\n");
            return -EINVAL;
        }
        if (tgt_prog->type == prog->type) {
            /* Cannot fentry/fexit another fentry/fexit program.
             * Cannot attach program extension to another extension.
             * It's ok to attach fentry/fexit to extension program.
             */
            verbose(env, "Cannot recursively attach\n");
            return -EINVAL;
        }
        if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
            prog_extension &&
            (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
             tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
            /* Program extensions can extend all program types
             * except fentry/fexit. The reason is the following.
             * The fentry/fexit programs are used for performance
             * analysis, stats and can be attached to any program
             * type except themselves. When extension program is
             * replacing XDP function it is necessary to allow
             * performance analysis of all functions. Both original
             * XDP program and its program extension. Hence
             * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
             * allowed. If extending of fentry/fexit was allowed it
             * would be possible to create long call chain
             * fentry->extension->fentry->extension beyond
             * reasonable stack size. Hence extending fentry is not
             * allowed.
             */
            verbose(env, "Cannot extend fentry/fexit\n");
            return -EINVAL;
        }
        key = ((u64)aux->id) << 32 | btf_id;
    } else {
        if (prog_extension) {
            verbose(env, "Cannot replace kernel functions\n");
            return -EINVAL;
        }
        key = btf_id;
    }

@@ -9603,6 +9920,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
        prog->aux->attach_func_proto = t;
        prog->aux->attach_btf_trace = true;
        return 0;
    default:
        if (!prog_extension)
            return -EINVAL;
        /* fallthrough */
    case BPF_TRACE_FENTRY:
    case BPF_TRACE_FEXIT:
        if (!btf_type_is_func(t)) {
@@ -9610,6 +9931,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
                    btf_id);
            return -EINVAL;
        }
        if (prog_extension &&
            btf_check_type_match(env, prog, btf, t))
            return -EINVAL;
        t = btf_type_by_id(btf, t->type);
        if (!btf_type_is_func_proto(t))
            return -EINVAL;
@@ -9633,18 +9957,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
        if (ret < 0)
            goto out;
        if (tgt_prog) {
            if (!tgt_prog->jited) {
                /* for now */
                verbose(env, "Can trace only JITed BPF progs\n");
                ret = -EINVAL;
                goto out;
            }
            if (tgt_prog->type == BPF_PROG_TYPE_TRACING) {
                /* prevent cycles */
                verbose(env, "Cannot recursively attach\n");
                ret = -EINVAL;
                goto out;
            }
            if (subprog == 0)
                addr = (long) tgt_prog->bpf_func;
            else
@@ -9666,8 +9978,6 @@ out:
        if (ret)
            bpf_trampoline_put(tr);
        return ret;
    default:
        return -EINVAL;
    }
}

@@ -9737,10 +10047,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
        goto skip_full_check;
    }

    ret = check_attach_btf_id(env);
    if (ret)
        goto skip_full_check;

    env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
    if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
        env->strict_alignment = true;
@@ -9777,22 +10083,22 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
    if (ret < 0)
        goto skip_full_check;

    ret = check_attach_btf_id(env);
    if (ret)
        goto skip_full_check;

    ret = check_cfg(env);
    if (ret < 0)
        goto skip_full_check;

    ret = do_check(env);
    if (env->cur_state) {
        free_verifier_state(env->cur_state, true);
        env->cur_state = NULL;
    }
    ret = do_check_subprogs(env);
    ret = ret ?: do_check_main(env);

    if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
        ret = bpf_prog_offload_finalize(env);

skip_full_check:
    while (!pop_stack(env, NULL, NULL));
    free_states(env);
    kvfree(env->explored_states);

    if (ret == 0)
        ret = check_max_stack_depth(env);

net/xdp/xskmap.c
@@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs,
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
    struct bpf_map_memory mem;
    int cpu, err, numa_node;
    int err, numa_node;
    struct xsk_map *m;
    u64 cost, size;
    u64 size;

    if (!capable(CAP_NET_ADMIN))
        return ERR_PTR(-EPERM);
@@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)

    numa_node = bpf_map_attr_numa_node(attr);
    size = struct_size(m, xsk_map, attr->max_entries);
    cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());

    err = bpf_map_charge_init(&mem, cost);
    err = bpf_map_charge_init(&mem, size);
    if (err < 0)
        return ERR_PTR(err);

@@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
    bpf_map_charge_move(&m->map.memory, &mem);
    spin_lock_init(&m->lock);

    m->flush_list = alloc_percpu(struct list_head);
    if (!m->flush_list) {
        bpf_map_charge_finish(&m->map.memory);
        bpf_map_area_free(m);
        return ERR_PTR(-ENOMEM);
    }

    for_each_possible_cpu(cpu)
        INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));

    return &m->map;
}

@@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map)

    bpf_clear_redirect_map(map);
    synchronize_net();
    free_percpu(m->flush_list);
    bpf_map_area_free(m);
}

kernel/cgroup/cgroup.c
@@ -6289,12 +6289,13 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)

#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
                      enum bpf_attach_type type, u32 flags)
                      struct bpf_prog *replace_prog, enum bpf_attach_type type,
                      u32 flags)
{
    int ret;

    mutex_lock(&cgroup_mutex);
    ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
    ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags);
    mutex_unlock(&cgroup_mutex);
    return ret;
}

kernel/cpu.c
@@ -525,8 +525,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
    if (WARN_ON_ONCE((!cpu_online(cpu))))
        return -ECANCELED;

    /* Unpark the stopper thread and the hotplug thread of the target cpu */
    stop_machine_unpark(cpu);
    /* Unpark the hotplug thread of the target cpu */
    kthread_unpark(st->thread);

    /*
@@ -1089,8 +1088,8 @@ void notify_cpu_starting(unsigned int cpu)

/*
 * Called from the idle task. Wake up the controlling task which brings the
 * stopper and the hotplug thread of the upcoming CPU up and then delegates
 * the rest of the online bringup to the hotplug thread.
 * hotplug thread of the upcoming CPU up and then delegates the rest of the
 * online bringup to the hotplug thread.
 */
void cpuhp_online_idle(enum cpuhp_state state)
{
@@ -1100,6 +1099,12 @@ void cpuhp_online_idle(enum cpuhp_state state)
    if (state != CPUHP_AP_ONLINE_IDLE)
        return;

    /*
     * Unpark the stopper thread before we start the idle loop (and start
     * scheduling); this ensures the stopper task is always available.
     */
    stop_machine_unpark(smp_processor_id());

    st->state = CPUHP_AP_ONLINE_IDLE;
    complete_ap_thread(st, true);
}

kernel/extable.c
@@ -131,8 +131,9 @@ int kernel_text_address(unsigned long addr)
     * triggers a stack trace, or a WARN() that happens during
     * coming back from idle, or cpu on or offlining.
     *
     * is_module_text_address() as well as the kprobe slots
     * and is_bpf_text_address() require RCU to be watching.
     * is_module_text_address() as well as the kprobe slots,
     * is_bpf_text_address() and is_bpf_image_address() require
     * RCU to be watching.
     */
    no_rcu = !rcu_is_watching();

@@ -148,6 +149,8 @@ int kernel_text_address(unsigned long addr)
        goto out;
    if (is_bpf_text_address(addr))
        goto out;
    if (is_bpf_image_address(addr))
        goto out;
    ret = 0;
out:
    if (no_rcu)

kernel/gcov/Kconfig
@@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
    bool "Enable gcov-based kernel profiling"
    depends on DEBUG_FS
    select CONSTRUCTORS
    select CONSTRUCTORS if !UML
    default n
    ---help---
    This option enables gcov-based code profiling (e.g. for code coverage

kernel/kprobes.c
@@ -510,6 +510,8 @@ static void do_unoptimize_kprobes(void)
    arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
    /* Loop free_list for disarming */
    list_for_each_entry_safe(op, tmp, &freeing_list, list) {
        /* Switching from detour code to origin */
        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
        /* Disarm probes if marked disabled */
        if (kprobe_disabled(&op->kp))
            arch_disarm_kprobe(&op->kp);
@@ -610,6 +612,18 @@ void wait_for_kprobe_optimizer(void)
    mutex_unlock(&kprobe_mutex);
}

static bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
    struct optimized_kprobe *_op;

    list_for_each_entry(_op, &unoptimizing_list, list) {
        if (op == _op)
            return true;
    }

    return false;
}

/* Optimize kprobe if p is ready to be optimized */
static void optimize_kprobe(struct kprobe *p)
{
@@ -631,17 +645,21 @@ static void optimize_kprobe(struct kprobe *p)
        return;

    /* Check if it is already optimized. */
    if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
    if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
        if (optprobe_queued_unopt(op)) {
            /* This is under unoptimizing. Just dequeue the probe */
            list_del_init(&op->list);
        }
        return;
    }
    op->kp.flags |= KPROBE_FLAG_OPTIMIZED;

    if (!list_empty(&op->list))
        /* This is under unoptimizing. Just dequeue the probe */
        list_del_init(&op->list);
    else {
        list_add(&op->list, &optimizing_list);
        kick_kprobe_optimizer();
    }
    /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
    if (WARN_ON_ONCE(!list_empty(&op->list)))
        return;

    list_add(&op->list, &optimizing_list);
    kick_kprobe_optimizer();
}

/* Short cut to direct unoptimizing */
@@ -649,6 +667,7 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
    lockdep_assert_cpus_held();
    arch_unoptimize_kprobe(op);
    op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
    if (kprobe_disabled(&op->kp))
        arch_disarm_kprobe(&op->kp);
}
@@ -662,31 +681,33 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
        return; /* This is not an optprobe nor optimized */

    op = container_of(p, struct optimized_kprobe, kp);
    if (!kprobe_optimized(p)) {
        /* Unoptimized or unoptimizing case */
        if (force && !list_empty(&op->list)) {
            /*
             * Only if this is unoptimizing kprobe and forced,
             * forcibly unoptimize it. (No need to unoptimize
             * unoptimized kprobe again :)
             */
    if (!kprobe_optimized(p))
        return;

    if (!list_empty(&op->list)) {
        if (optprobe_queued_unopt(op)) {
            /* Queued in unoptimizing queue */
            if (force) {
                /*
                 * Forcibly unoptimize the kprobe here, and queue it
                 * in the freeing list for release afterwards.
                 */
                force_unoptimize_kprobe(op);
                list_move(&op->list, &freeing_list);
            }
        } else {
            /* Dequeue from the optimizing queue */
            list_del_init(&op->list);
            force_unoptimize_kprobe(op);
            op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
        }
        return;
    }

    op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
    if (!list_empty(&op->list)) {
        /* Dequeue from the optimization queue */
        list_del_init(&op->list);
        return;
    }
    /* Optimized kprobe case */
    if (force)
    if (force) {
        /* Forcibly update the code: this is a special case */
        force_unoptimize_kprobe(op);
    else {
    } else {
        list_add(&op->list, &unoptimizing_list);
        kick_kprobe_optimizer();
    }

kernel/locking/lockdep_proc.c
@@ -286,9 +286,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
    seq_printf(m, " stack-trace entries:           %11lu [max: %lu]\n",
            nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
    seq_printf(m, " number of stack traces:        %llu\n",
    seq_printf(m, " number of stack traces:        %11llu\n",
               lockdep_stack_trace_count());
    seq_printf(m, " number of stack hash chains:   %llu\n",
    seq_printf(m, " number of stack hash chains:   %11llu\n",
               lockdep_stack_hash_count());
#endif
    seq_printf(m, " combined max dependencies:     %11u\n",

@@ -134,20 +134,17 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/

while (!READ_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
* Use vcpu_is_preempted() to avoid waiting for a preempted
* lock holder:
*/
if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
goto unqueue;
/*
* Wait to acquire the lock or cancelation. Note that need_resched()
* will come with an IPI, which will wake smp_cond_load_relaxed() if it
* is implemented with a monitor-wait. vcpu_is_preempted() relies on
* polling, be careful.
*/
if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
vcpu_is_preempted(node_cpu(node->prev))))
return true;

cpu_relax();
}
return true;

unqueue:
/* unqueue */
/*
* Step - A -- stabilize @prev
*

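The osq_lock() change above replaces the open-coded READ_ONCE()/cpu_relax() spin with smp_cond_load_relaxed(), which architectures may implement with a monitor/mwait-style wait instead of a busy loop. A minimal usage sketch (hypothetical wrapper, not from the patch):

#include <linux/atomic.h>	/* smp_cond_load_relaxed() */
#include <linux/sched.h>	/* need_resched() */

static int wait_for_handoff(int *locked)
{
	/*
	 * VAL names the freshly loaded *locked inside the condition.
	 * The macro returns the value that finally satisfied the
	 * condition, so the caller can tell a real handoff (nonzero)
	 * apart from a need_resched() bailout.
	 */
	return smp_cond_load_relaxed(locked, VAL || need_resched());
}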
@@ -31,14 +31,15 @@
/*
* The basic principle of a queue-based spinlock can best be understood
* by studying a classic queue-based spinlock implementation called the
* MCS lock. The paper below provides a good description for this kind
* of lock.
* MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
* Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
* Scott") is available at
*
* http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
* https://bugzilla.kernel.org/show_bug.cgi?id=206115
*
* This queued spinlock implementation is based on the MCS lock, however to make
* it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
* API, we must modify it somehow.
* This queued spinlock implementation is based on the MCS lock, however to
* make it fit the 4 bytes we assume spinlock_t to be, and preserve its
* existing API, we must modify it somehow.
*
* In particular; where the traditional MCS lock consists of a tail pointer
* (8 bytes) and needs the next pointer (another 8 bytes) of its own node to

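For readers without the paper at hand, the classic MCS algorithm the comment refers to fits in a few lines. This is a textbook sketch using kernel-style primitives (xchg(), cmpxchg(), READ_ONCE()), not the kernel's qspinlock itself, which packs the same queue state into 4 bytes:

struct mcs_node {
	struct mcs_node *next;
	int locked;
};

static void mcs_lock(struct mcs_node **tail, struct mcs_node *me)
{
	struct mcs_node *prev;

	me->next = NULL;
	me->locked = 0;
	prev = xchg(tail, me);			/* atomically join the queue */
	if (prev) {
		WRITE_ONCE(prev->next, me);
		while (!READ_ONCE(me->locked))	/* spin on our own cacheline */
			cpu_relax();
	}
}

static void mcs_unlock(struct mcs_node **tail, struct mcs_node *me)
{
	struct mcs_node *next = READ_ONCE(me->next);

	if (!next) {
		if (cmpxchg(tail, me, NULL) == me)	/* queue now empty */
			return;
		while (!(next = READ_ONCE(me->next)))	/* successor racing in */
			cpu_relax();
	}
	WRITE_ONCE(next->locked, 1);		/* hand the lock over */
}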
@@ -2031,49 +2031,6 @@ static void module_enable_nx(const struct module *mod)
frob_writable_data(&mod->init_layout, set_memory_nx);
}

/* Iterate through all modules and set each module's text as RW */
void set_all_modules_text_rw(void)
{
struct module *mod;

if (!rodata_enabled)
return;

mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;

frob_text(&mod->core_layout, set_memory_rw);
frob_text(&mod->init_layout, set_memory_rw);
}
mutex_unlock(&module_mutex);
}

/* Iterate through all modules and set each module's text as RO */
void set_all_modules_text_ro(void)
{
struct module *mod;

if (!rodata_enabled)
return;

mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
/*
* Ignore going modules since it's possible that ro
* protection has already been disabled, otherwise we'll
* run into protection faults at module deallocation.
*/
if (mod->state == MODULE_STATE_UNFORMED ||
mod->state == MODULE_STATE_GOING)
continue;

frob_text(&mod->core_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_ro);
}
mutex_unlock(&module_mutex);
}
#else /* !CONFIG_STRICT_MODULE_RWX */
static void module_enable_nx(const struct module *mod) { }
#endif /* CONFIG_STRICT_MODULE_RWX */

388
kernel/padata.c
@@ -2,7 +2,7 @@
/*
* padata.c - generic interface to process data streams in parallel
*
* See Documentation/padata.txt for an api documentation.
* See Documentation/core-api/padata.rst for more information.
*
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
@@ -35,6 +35,8 @@

#define MAX_OBJ_NUM 1000

static void padata_free_pd(struct parallel_data *pd);

static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
int cpu, target_cpu;
@@ -87,7 +89,7 @@ static void padata_parallel_worker(struct work_struct *parallel_work)
/**
* padata_do_parallel - padata parallelization function
*
* @pinst: padata instance
* @ps: padatashell
* @padata: object to be parallelized
* @cb_cpu: pointer to the CPU that the serialization callback function should
* run on. If it's not in the serial cpumask of @pinst
@@ -97,17 +99,20 @@ static void padata_parallel_worker(struct work_struct *parallel_work)
* The parallelization callback function will run with BHs off.
* Note: Every object which is parallelized by padata_do_parallel
* must be seen by padata_do_serial.
*
* Return: 0 on success or else negative error code.
*/
int padata_do_parallel(struct padata_instance *pinst,
int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu)
{
struct padata_instance *pinst = ps->pinst;
int i, cpu, cpu_index, target_cpu, err;
struct padata_parallel_queue *queue;
struct parallel_data *pd;

rcu_read_lock_bh();

pd = rcu_dereference_bh(pinst->pd);
pd = rcu_dereference_bh(ps->pd);

err = -EINVAL;
if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
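With this signature change, submitters hand in a padata_shell rather than the instance itself; each shell carries its own RCU-protected parallel_data. A hypothetical caller might look like this (all names invented for illustration):

struct my_request {
	struct padata_priv padata;	/* embedded control structure */
	/* ... payload ... */
};

static void my_parallel(struct padata_priv *padata) { /* runs with BHs off */ }
static void my_serial(struct padata_priv *padata) { /* runs in submission order */ }

static int my_submit(struct padata_shell *ps, struct my_request *req, int *cb_cpu)
{
	req->padata.parallel = my_parallel;
	req->padata.serial = my_serial;
	return padata_do_parallel(ps, &req->padata, cb_cpu);
}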
@@ -160,14 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel);
/*
* padata_find_next - Find the next object that needs serialization.
*
* Return values are:
*
* A pointer to the control struct of the next object that needs
* serialization, if present in one of the percpu reorder queues.
*
* NULL, if the next object that needs serialization will
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
* Return:
* * A pointer to the control struct of the next object that needs
* serialization, if present in one of the percpu reorder queues.
* * NULL, if the next object that needs serialization will
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
*/
static struct padata_priv *padata_find_next(struct parallel_data *pd,
bool remove_object)
@@ -199,7 +202,6 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,

if (remove_object) {
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
++pd->processed;
pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);
}
@@ -210,10 +212,10 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,

static void padata_reorder(struct parallel_data *pd)
{
struct padata_instance *pinst = pd->ps->pinst;
int cb_cpu;
struct padata_priv *padata;
struct padata_serial_queue *squeue;
struct padata_instance *pinst = pd->pinst;
struct padata_parallel_queue *next_queue;

/*
@@ -283,6 +285,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
struct padata_serial_queue *squeue;
struct parallel_data *pd;
LIST_HEAD(local_list);
int cnt;

local_bh_disable();
squeue = container_of(serial_work, struct padata_serial_queue, work);
@@ -292,6 +295,8 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_replace_init(&squeue->serial.list, &local_list);
spin_unlock(&squeue->serial.lock);

cnt = 0;

while (!list_empty(&local_list)) {
struct padata_priv *padata;

@@ -301,9 +306,12 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_del_init(&padata->list);

padata->serial(padata);
atomic_dec(&pd->refcnt);
cnt++;
}
local_bh_enable();

if (atomic_sub_and_test(cnt, &pd->refcnt))
padata_free_pd(pd);
}

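Taken together with the atomic_set(&pd->refcnt, 1) further down, the cnt accounting above turns parallel_data lifetime into plain reference counting: the shell holds one base reference, each in-flight object holds one, and whoever drops the count to zero frees pd. In outline (condensed from the hunks in this patch, not a verbatim excerpt):

atomic_set(&pd->refcnt, 1);		/* base ref, held while pd is current */

atomic_inc(&pd->refcnt);		/* per object, taken at submission */

if (atomic_sub_and_test(cnt, &pd->refcnt))	/* serial side, per batch */
	padata_free_pd(pd);		/* hit zero: base ref already dropped */

if (atomic_dec_and_test(&ps->opd->refcnt))	/* replace side: drop base */
	padata_free_pd(ps->opd);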
/**
@@ -327,7 +335,6 @@ void padata_do_serial(struct padata_priv *padata)
if (cur->seq_nr < padata->seq_nr)
break;
list_add(&padata->list, &cur->list);
atomic_inc(&pd->reorder_objects);
spin_unlock(&pqueue->reorder.lock);

/*
@@ -341,36 +348,39 @@ void padata_do_serial(struct padata_priv *padata)
}
EXPORT_SYMBOL(padata_do_serial);

static int padata_setup_cpumasks(struct parallel_data *pd,
const struct cpumask *pcpumask,
const struct cpumask *cbcpumask)
static int padata_setup_cpumasks(struct padata_instance *pinst)
{
struct workqueue_attrs *attrs;
int err;

attrs = alloc_workqueue_attrs();
if (!attrs)
return -ENOMEM;

/* Restrict parallel_wq workers to pd->cpumask.pcpu. */
cpumask_copy(attrs->cpumask, pinst->cpumask.pcpu);
err = apply_workqueue_attrs(pinst->parallel_wq, attrs);
free_workqueue_attrs(attrs);

return err;
}

static int pd_setup_cpumasks(struct parallel_data *pd,
const struct cpumask *pcpumask,
const struct cpumask *cbcpumask)
{
int err = -ENOMEM;

if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
goto out;
cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);

if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
goto free_pcpu_mask;
cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);

attrs = alloc_workqueue_attrs();
if (!attrs)
goto free_cbcpu_mask;

/* Restrict parallel_wq workers to pd->cpumask.pcpu. */
cpumask_copy(attrs->cpumask, pd->cpumask.pcpu);
err = apply_workqueue_attrs(pd->pinst->parallel_wq, attrs);
free_workqueue_attrs(attrs);
if (err < 0)
goto free_cbcpu_mask;
cpumask_copy(pd->cpumask.pcpu, pcpumask);
cpumask_copy(pd->cpumask.cbcpu, cbcpumask);

return 0;

free_cbcpu_mask:
free_cpumask_var(pd->cpumask.cbcpu);
free_pcpu_mask:
free_cpumask_var(pd->cpumask.pcpu);
out:
@@ -414,12 +424,16 @@ static void padata_init_pqueues(struct parallel_data *pd)
}

/* Allocate and initialize the internal cpumask dependend resources. */
static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
const struct cpumask *pcpumask,
const struct cpumask *cbcpumask)
static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
{
struct padata_instance *pinst = ps->pinst;
const struct cpumask *cbcpumask;
const struct cpumask *pcpumask;
struct parallel_data *pd;

cbcpumask = pinst->rcpumask.cbcpu;
pcpumask = pinst->rcpumask.pcpu;

pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
if (!pd)
goto err;
@@ -432,15 +446,14 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
if (!pd->squeue)
goto err_free_pqueue;

pd->pinst = pinst;
if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
pd->ps = ps;
if (pd_setup_cpumasks(pd, pcpumask, cbcpumask))
goto err_free_squeue;

padata_init_pqueues(pd);
padata_init_squeues(pd);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 0);
atomic_set(&pd->refcnt, 1);
spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
@@ -466,29 +479,6 @@ static void padata_free_pd(struct parallel_data *pd)
kfree(pd);
}

/* Flush all objects out of the padata queues. */
static void padata_flush_queues(struct parallel_data *pd)
{
int cpu;
struct padata_parallel_queue *pqueue;
struct padata_serial_queue *squeue;

for_each_cpu(cpu, pd->cpumask.pcpu) {
pqueue = per_cpu_ptr(pd->pqueue, cpu);
flush_work(&pqueue->work);
}

if (atomic_read(&pd->reorder_objects))
padata_reorder(pd);

for_each_cpu(cpu, pd->cpumask.cbcpu) {
squeue = per_cpu_ptr(pd->squeue, cpu);
flush_work(&squeue->work);
}

BUG_ON(atomic_read(&pd->refcnt) != 0);
}

static void __padata_start(struct padata_instance *pinst)
{
pinst->flags |= PADATA_INIT;
@@ -502,72 +492,52 @@ static void __padata_stop(struct padata_instance *pinst)
pinst->flags &= ~PADATA_INIT;

synchronize_rcu();

get_online_cpus();
padata_flush_queues(pinst->pd);
put_online_cpus();
}

/* Replace the internal control structure with a new one. */
static void padata_replace(struct padata_instance *pinst,
struct parallel_data *pd_new)
static int padata_replace_one(struct padata_shell *ps)
{
struct parallel_data *pd_old = pinst->pd;
int notification_mask = 0;
struct parallel_data *pd_new;

pd_new = padata_alloc_pd(ps);
if (!pd_new)
return -ENOMEM;

ps->opd = rcu_dereference_protected(ps->pd, 1);
rcu_assign_pointer(ps->pd, pd_new);

return 0;
}

static int padata_replace(struct padata_instance *pinst)
{
struct padata_shell *ps;
int err;

pinst->flags |= PADATA_RESET;

rcu_assign_pointer(pinst->pd, pd_new);
cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu,
cpu_online_mask);

cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu,
cpu_online_mask);

list_for_each_entry(ps, &pinst->pslist, list) {
err = padata_replace_one(ps);
if (err)
break;
}

synchronize_rcu();

if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
notification_mask |= PADATA_CPU_PARALLEL;
if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
notification_mask |= PADATA_CPU_SERIAL;

padata_flush_queues(pd_old);
padata_free_pd(pd_old);

if (notification_mask)
blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
notification_mask,
&pd_new->cpumask);
list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
if (atomic_dec_and_test(&ps->opd->refcnt))
padata_free_pd(ps->opd);

pinst->flags &= ~PADATA_RESET;
}

/**
* padata_register_cpumask_notifier - Registers a notifier that will be called
* if either pcpu or cbcpu or both cpumasks change.
*
* @pinst: A pointer to padata instance
* @nblock: A pointer to notifier block.
*/
int padata_register_cpumask_notifier(struct padata_instance *pinst,
struct notifier_block *nblock)
{
return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
nblock);
return err;
}
EXPORT_SYMBOL(padata_register_cpumask_notifier);

/**
* padata_unregister_cpumask_notifier - Unregisters cpumask notifier
* registered earlier using padata_register_cpumask_notifier
*
* @pinst: A pointer to data instance.
* @nlock: A pointer to notifier block.
*/
int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
struct notifier_block *nblock)
{
return blocking_notifier_chain_unregister(
&pinst->cpumask_change_notifier,
nblock);
}
EXPORT_SYMBOL(padata_unregister_cpumask_notifier);


/* If cpumask contains no active cpu, we mark the instance as invalid. */
static bool padata_validate_cpumask(struct padata_instance *pinst,
@@ -587,7 +557,7 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
cpumask_var_t cbcpumask)
{
int valid;
struct parallel_data *pd;
int err;

valid = padata_validate_cpumask(pinst, pcpumask);
if (!valid) {
@@ -600,29 +570,26 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
__padata_stop(pinst);

out_replace:
pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
if (!pd)
return -ENOMEM;

cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);

padata_replace(pinst, pd);
err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst);

if (valid)
__padata_start(pinst);

return 0;
return err;
}

/**
* padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
* equivalent to @cpumask.
*
* padata_set_cpumask - Sets specified by @cpumask_type cpumask to the value
* equivalent to @cpumask.
* @pinst: padata instance
* @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
* to parallel and serial cpumasks respectively.
* @cpumask: the cpumask to use
*
* Return: 0 on success or negative error code
*/
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask)
@@ -630,8 +597,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;

mutex_lock(&pinst->lock);
get_online_cpus();
mutex_lock(&pinst->lock);

switch (cpumask_type) {
case PADATA_CPU_PARALLEL:
@@ -649,8 +616,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);

out:
put_online_cpus();
mutex_unlock(&pinst->lock);
put_online_cpus();

return err;
}
@@ -660,6 +627,8 @@ EXPORT_SYMBOL(padata_set_cpumask);
* padata_start - start the parallel processing
*
* @pinst: padata instance to start
*
* Return: 0 on success or negative error code
*/
int padata_start(struct padata_instance *pinst)
{
@@ -695,82 +664,33 @@ EXPORT_SYMBOL(padata_stop);

static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
{
struct parallel_data *pd;
int err = 0;

if (cpumask_test_cpu(cpu, cpu_online_mask)) {
pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
pinst->cpumask.cbcpu);
if (!pd)
return -ENOMEM;

padata_replace(pinst, pd);
err = padata_replace(pinst);

if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
__padata_start(pinst);
}

return 0;
return err;
}

static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
struct parallel_data *pd = NULL;

if (cpumask_test_cpu(cpu, cpu_online_mask)) {
int err = 0;

if (!cpumask_test_cpu(cpu, cpu_online_mask)) {
if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
!padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
__padata_stop(pinst);

pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
pinst->cpumask.cbcpu);
if (!pd)
return -ENOMEM;

padata_replace(pinst, pd);

cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
err = padata_replace(pinst);
}

return 0;
}

/**
* padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
* padata cpumasks.
*
* @pinst: padata instance
* @cpu: cpu to remove
* @mask: bitmask specifying from which cpumask @cpu should be removed
* The @mask may be any combination of the following flags:
* PADATA_CPU_SERIAL - serial cpumask
* PADATA_CPU_PARALLEL - parallel cpumask
*/
int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
{
int err;

if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
return -EINVAL;

mutex_lock(&pinst->lock);

get_online_cpus();
if (mask & PADATA_CPU_SERIAL)
cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
if (mask & PADATA_CPU_PARALLEL)
cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);

err = __padata_remove_cpu(pinst, cpu);
put_online_cpus();

mutex_unlock(&pinst->lock);

return err;
}
EXPORT_SYMBOL(padata_remove_cpu);

static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
{
@@ -793,7 +713,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node)
return ret;
}

static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
struct padata_instance *pinst;
int ret;
@@ -814,11 +734,15 @@ static enum cpuhp_state hp_online;
static void __padata_free(struct padata_instance *pinst)
{
#ifdef CONFIG_HOTPLUG_CPU
cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node);
cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node);
#endif

WARN_ON(!list_empty(&pinst->pslist));

padata_stop(pinst);
padata_free_pd(pinst->pd);
free_cpumask_var(pinst->rcpumask.cbcpu);
free_cpumask_var(pinst->rcpumask.pcpu);
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
destroy_workqueue(pinst->serial_wq);
@@ -959,13 +883,14 @@ static struct kobj_type padata_attr_type = {
* @name: used to identify the instance
* @pcpumask: cpumask that will be used for padata parallelization
* @cbcpumask: cpumask that will be used for padata serialization
*
* Return: new instance on success, NULL on error
*/
static struct padata_instance *padata_alloc(const char *name,
const struct cpumask *pcpumask,
const struct cpumask *cbcpumask)
{
struct padata_instance *pinst;
struct parallel_data *pd = NULL;

pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
if (!pinst)
@@ -993,29 +918,40 @@ static struct padata_instance *padata_alloc(const char *name,
!padata_validate_cpumask(pinst, cbcpumask))
goto err_free_masks;

pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
if (!pd)
if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL))
goto err_free_masks;
if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL))
goto err_free_rcpumask_pcpu;

rcu_assign_pointer(pinst->pd, pd);
INIT_LIST_HEAD(&pinst->pslist);

cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask);
cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask);

if (padata_setup_cpumasks(pinst))
goto err_free_rcpumask_cbcpu;

pinst->flags = 0;

BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);

#ifdef CONFIG_HOTPLUG_CPU
cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node);
cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD,
&pinst->node);
#endif

put_online_cpus();

return pinst;

err_free_rcpumask_cbcpu:
free_cpumask_var(pinst->rcpumask.cbcpu);
err_free_rcpumask_pcpu:
free_cpumask_var(pinst->rcpumask.pcpu);
err_free_masks:
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
@@ -1036,6 +972,8 @@ err:
* parallel workers.
*
* @name: used to identify the instance
*
* Return: new instance on success, NULL on error
*/
struct padata_instance *padata_alloc_possible(const char *name)
{
@@ -1046,7 +984,7 @@ EXPORT_SYMBOL(padata_alloc_possible);
/**
* padata_free - free a padata instance
*
* @padata_inst: padata instance to free
* @pinst: padata instance to free
*/
void padata_free(struct padata_instance *pinst)
{
@@ -1054,6 +992,63 @@ void padata_free(struct padata_instance *pinst)
}
EXPORT_SYMBOL(padata_free);

/**
* padata_alloc_shell - Allocate and initialize padata shell.
*
* @pinst: Parent padata_instance object.
*
* Return: new shell on success, NULL on error
*/
struct padata_shell *padata_alloc_shell(struct padata_instance *pinst)
{
struct parallel_data *pd;
struct padata_shell *ps;

ps = kzalloc(sizeof(*ps), GFP_KERNEL);
if (!ps)
goto out;

ps->pinst = pinst;

get_online_cpus();
pd = padata_alloc_pd(ps);
put_online_cpus();

if (!pd)
goto out_free_ps;

mutex_lock(&pinst->lock);
RCU_INIT_POINTER(ps->pd, pd);
list_add(&ps->list, &pinst->pslist);
mutex_unlock(&pinst->lock);

return ps;

out_free_ps:
kfree(ps);
out:
return NULL;
}
EXPORT_SYMBOL(padata_alloc_shell);

/**
* padata_free_shell - free a padata shell
*
* @ps: padata shell to free
*/
void padata_free_shell(struct padata_shell *ps)
{
struct padata_instance *pinst = ps->pinst;

mutex_lock(&pinst->lock);
list_del(&ps->list);
padata_free_pd(rcu_dereference_protected(ps->pd, 1));
mutex_unlock(&pinst->lock);

kfree(ps);
}
EXPORT_SYMBOL(padata_free_shell);

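A shell's lifetime brackets the instance's: allocate it after the instance, free it before. A minimal hypothetical user of the new API (error handling trimmed for brevity):

struct padata_instance *pinst = padata_alloc_possible("demo");
struct padata_shell *ps = padata_alloc_shell(pinst);

/* ... submit objects against ps via padata_do_parallel() ... */

padata_free_shell(ps);
padata_free(pinst);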
#ifdef CONFIG_HOTPLUG_CPU

static __init int padata_driver_init(void)
@@ -1061,17 +1056,24 @@ static __init int padata_driver_init(void)
int ret;

ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
padata_cpu_online,
padata_cpu_prep_down);
padata_cpu_online, NULL);
if (ret < 0)
return ret;
hp_online = ret;

ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead",
NULL, padata_cpu_dead);
if (ret < 0) {
cpuhp_remove_multi_state(hp_online);
return ret;
}
return 0;
}
module_init(padata_driver_init);

static __exit void padata_driver_exit(void)
{
cpuhp_remove_multi_state(CPUHP_PADATA_DEAD);
cpuhp_remove_multi_state(hp_online);
}
module_exit(padata_driver_exit);

@@ -7,7 +7,7 @@ menu "RCU Subsystem"

config TREE_RCU
bool
default y if !PREEMPTION && SMP
default y if SMP
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
@@ -17,6 +17,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
default y if PREEMPTION
select TREE_RCU
help
This option selects the RCU implementation that is
designed for very large SMP systems with hundreds or
@@ -78,7 +79,7 @@ config TASKS_RCU
user-mode execution as quiescent states.

config RCU_STALL_COMMON
def_bool ( TREE_RCU || PREEMPT_RCU )
def_bool TREE_RCU
help
This option enables RCU CPU stall code that is common between
the TINY and TREE variants of RCU. The purpose is to allow
@@ -86,13 +87,13 @@ config RCU_STALL_COMMON
making these warnings mandatory for the tree variants.

config RCU_NEED_SEGCBLIST
def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
def_bool ( TREE_RCU || TREE_SRCU )

config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
range 2 64 if 64BIT
range 2 32 if !64BIT
depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
depends on TREE_RCU && RCU_EXPERT
default 64 if 64BIT
default 32 if !64BIT
help
@@ -112,7 +113,7 @@ config RCU_FANOUT_LEAF
int "Tree-based hierarchical RCU leaf-level fanout value"
range 2 64 if 64BIT
range 2 32 if !64BIT
depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
depends on TREE_RCU && RCU_EXPERT
default 16
help
This option controls the leaf-level fanout of hierarchical
@@ -187,7 +188,7 @@ config RCU_BOOST_DELAY

config RCU_NOCB_CPU
bool "Offload RCU callback processing from boot-selected CPUs"
depends on TREE_RCU || PREEMPT_RCU
depends on TREE_RCU
depends on RCU_EXPERT || NO_HZ_FULL
default n
help
@@ -200,8 +201,8 @@ config RCU_NOCB_CPU
specified at boot time by the rcu_nocbs parameter. For each
such CPU, a kthread ("rcuox/N") will be created to invoke
callbacks, where the "N" is the CPU being offloaded, and where
the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched
(!PREEMPT kernels). Nothing prevents this kthread from running
the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched
(!PREEMPTION kernels). Nothing prevents this kthread from running
on the specified CPUs, but (1) the kthreads may be preempted
between each callback, and (2) affinity or cgroups can be used
to force the kthreads to run on whatever set of CPUs is desired.

@@ -9,6 +9,5 @@ obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TINY_RCU) += tiny.o
obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o

@@ -198,33 +198,6 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */

void kfree(const void *);

/*
* Reclaim the specified callback, either by invoking it (non-lazy case)
* or freeing it directly (lazy case). Return true if lazy, false otherwise.
*/
static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
rcu_callback_t f;
unsigned long offset = (unsigned long)head->func;

rcu_lock_acquire(&rcu_callback_map);
if (__is_kfree_rcu_offset(offset)) {
trace_rcu_invoke_kfree_callback(rn, head, offset);
kfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
return true;
} else {
trace_rcu_invoke_callback(rn, head);
f = head->func;
WRITE_ONCE(head->func, (rcu_callback_t)0L);
f(head);
rcu_lock_release(&rcu_callback_map);
return false;
}
}

#ifdef CONFIG_RCU_STALL_COMMON

extern int rcu_cpu_stall_ftrace_dump;
@@ -281,7 +254,7 @@ void rcu_test_sync_prims(void);
*/
extern void resched_cpu(int cpu);

#if defined(SRCU) || !defined(TINY_RCU)
#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU)

#include <linux/rcu_node_tree.h>

@@ -418,7 +391,7 @@ do { \
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))

#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */

#ifdef CONFIG_SRCU
void srcu_init(void);
@@ -454,7 +427,7 @@ enum rcutorture_type {
INVALID_RCU_FLAVOR
};

#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
#if defined(CONFIG_TREE_RCU)
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
unsigned long *gp_seq);
void do_trace_rcu_torture_read(const char *rcutorturename,

@@ -20,14 +20,10 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
rclp->head = NULL;
rclp->tail = &rclp->head;
rclp->len = 0;
rclp->len_lazy = 0;
}

/*
* Enqueue an rcu_head structure onto the specified callback list.
* This function assumes that the callback is non-lazy because it
* is intended for use by no-CBs CPUs, which do not distinguish
* between lazy and non-lazy RCU callbacks.
*/
void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
{
@@ -54,7 +50,6 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
else
drclp->tail = &drclp->head;
drclp->len = srclp->len;
drclp->len_lazy = srclp->len_lazy;
if (!rhp) {
rcu_cblist_init(srclp);
} else {
@@ -62,16 +57,12 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
srclp->head = rhp;
srclp->tail = &rhp->next;
WRITE_ONCE(srclp->len, 1);
srclp->len_lazy = 0;
}
}

/*
* Dequeue the oldest rcu_head structure from the specified callback
* list. This function assumes that the callback is non-lazy, but
* the caller can later invoke rcu_cblist_dequeued_lazy() if it
* finds otherwise (and if it cares about laziness). This allows
* different users to have different ways of determining laziness.
* list.
*/
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
{
@@ -161,7 +152,6 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
for (i = 0; i < RCU_CBLIST_NSEGS; i++)
rsclp->tails[i] = &rsclp->head;
rcu_segcblist_set_len(rsclp, 0);
rsclp->len_lazy = 0;
rsclp->enabled = 1;
}

@@ -173,7 +163,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
{
WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
rsclp->enabled = 0;
}

@@ -253,11 +242,9 @@ bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp)
* absolutely not OK for it to ever miss posting a callback.
*/
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy)
struct rcu_head *rhp)
{
rcu_segcblist_inc_len(rsclp);
if (lazy)
rsclp->len_lazy++;
smp_mb(); /* Ensure counts are updated before callback is enqueued. */
rhp->next = NULL;
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
@@ -275,15 +262,13 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
* period. You have been warned.
*/
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy)
struct rcu_head *rhp)
{
int i;

if (rcu_segcblist_n_cbs(rsclp) == 0)
return false;
rcu_segcblist_inc_len(rsclp);
if (lazy)
rsclp->len_lazy++;
smp_mb(); /* Ensure counts are updated before callback is entrained. */
rhp->next = NULL;
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
@@ -307,8 +292,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
rclp->len_lazy += rsclp->len_lazy;
rsclp->len_lazy = 0;
rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
}

@@ -361,9 +344,7 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
rsclp->len_lazy += rclp->len_lazy;
rcu_segcblist_add_len(rsclp, rclp->len);
rclp->len_lazy = 0;
rclp->len = 0;
}

@@ -15,15 +15,6 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
return READ_ONCE(rclp->len);
}

/*
* Account for the fact that a previously dequeued callback turned out
* to be marked as lazy.
*/
static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
{
rclp->len_lazy--;
}

void rcu_cblist_init(struct rcu_cblist *rclp);
void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
@@ -59,18 +50,6 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
#endif
}

/* Return number of lazy callbacks in segmented callback list. */
static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
{
return rsclp->len_lazy;
}

/* Return number of lazy callbacks in segmented callback list. */
static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
{
return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy;
}

/*
* Is the specified rcu_segcblist enabled, for example, not corresponding
* to an offline CPU?
@@ -106,9 +85,9 @@ struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy);
struct rcu_head *rhp);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy);
struct rcu_head *rhp);
void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp);
void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,

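For callers, the lazy-accounting removal is a mechanical one-argument drop; there is no per-callback laziness tag left to maintain. Sketch of a call site before and after, condensed from the signature change above:

/* before */
rcu_segcblist_enqueue(&rdp->cblist, head, lazy);

/* after: laziness bookkeeping (len_lazy) no longer exists */
rcu_segcblist_enqueue(&rdp->cblist, head);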
@@ -86,6 +86,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
"Shutdown at end of performance tests.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?");

static char *perf_type = "rcu";
module_param(perf_type, charp, 0444);
@@ -105,8 +106,8 @@ static atomic_t n_rcu_perf_writer_finished;
static wait_queue_head_t shutdown_wq;
static u64 t_rcu_perf_writer_started;
static u64 t_rcu_perf_writer_finished;
static unsigned long b_rcu_perf_writer_started;
static unsigned long b_rcu_perf_writer_finished;
static unsigned long b_rcu_gp_test_started;
static unsigned long b_rcu_gp_test_finished;
static DEFINE_PER_CPU(atomic_t, n_async_inflight);

#define MAX_MEAS 10000
@@ -378,10 +379,10 @@ rcu_perf_writer(void *arg)
if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
t_rcu_perf_writer_started = t;
if (gp_exp) {
b_rcu_perf_writer_started =
b_rcu_gp_test_started =
cur_ops->exp_completed() / 2;
} else {
b_rcu_perf_writer_started = cur_ops->get_gp_seq();
b_rcu_gp_test_started = cur_ops->get_gp_seq();
}
}

@@ -429,10 +430,10 @@ retry:
PERFOUT_STRING("Test complete");
t_rcu_perf_writer_finished = t;
if (gp_exp) {
b_rcu_perf_writer_finished =
b_rcu_gp_test_finished =
cur_ops->exp_completed() / 2;
} else {
b_rcu_perf_writer_finished =
b_rcu_gp_test_finished =
cur_ops->get_gp_seq();
}
if (shutdown) {
@@ -515,8 +516,8 @@ rcu_perf_cleanup(void)
t_rcu_perf_writer_finished -
t_rcu_perf_writer_started,
ngps,
rcuperf_seq_diff(b_rcu_perf_writer_finished,
b_rcu_perf_writer_started));
rcuperf_seq_diff(b_rcu_gp_test_finished,
b_rcu_gp_test_started));
for (i = 0; i < nrealwriters; i++) {
if (!writer_durations)
break;
@@ -584,6 +585,159 @@ rcu_perf_shutdown(void *arg)
return -EINVAL;
}

/*
* kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number
* of iterations and measure total time and number of GP for all iterations to complete.
*/

torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu().");
torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration.");
torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees.");

static struct task_struct **kfree_reader_tasks;
static int kfree_nrealthreads;
static atomic_t n_kfree_perf_thread_started;
static atomic_t n_kfree_perf_thread_ended;

struct kfree_obj {
char kfree_obj[8];
struct rcu_head rh;
};

static int
kfree_perf_thread(void *arg)
{
int i, loop = 0;
long me = (long)arg;
struct kfree_obj *alloc_ptr;
u64 start_time, end_time;

VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
set_user_nice(current, MAX_NICE);

start_time = ktime_get_mono_fast_ns();

if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) {
if (gp_exp)
b_rcu_gp_test_started = cur_ops->exp_completed() / 2;
else
b_rcu_gp_test_started = cur_ops->get_gp_seq();
}

do {
for (i = 0; i < kfree_alloc_num; i++) {
alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
if (!alloc_ptr)
return -ENOMEM;

kfree_rcu(alloc_ptr, rh);
}

cond_resched();
} while (!torture_must_stop() && ++loop < kfree_loops);

if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) {
end_time = ktime_get_mono_fast_ns();

if (gp_exp)
b_rcu_gp_test_finished = cur_ops->exp_completed() / 2;
else
b_rcu_gp_test_finished = cur_ops->get_gp_seq();

pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n",
(unsigned long long)(end_time - start_time), kfree_loops,
rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started));
if (shutdown) {
smp_mb(); /* Assign before wake. */
wake_up(&shutdown_wq);
}
}

torture_kthread_stopping("kfree_perf_thread");
return 0;
}

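The test leans on the trick visible in the __rcu_reclaim() hunk earlier: kfree_rcu(ptr, field) encodes the offset of the rcu_head within the object as the "callback", so the reclaim path can do kfree((void *)head - offset) without invoking a function. Minimal usage, mirroring the loop above:

struct kfree_obj *p = kmalloc(sizeof(*p), GFP_KERNEL);

if (p)
	kfree_rcu(p, rh);	/* freed after a grace period, no callback fn */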
static void
kfree_perf_cleanup(void)
{
int i;

if (torture_cleanup_begin())
return;

if (kfree_reader_tasks) {
for (i = 0; i < kfree_nrealthreads; i++)
torture_stop_kthread(kfree_perf_thread,
kfree_reader_tasks[i]);
kfree(kfree_reader_tasks);
}

torture_cleanup_end();
}

/*
* shutdown kthread. Just waits to be awakened, then shuts down system.
*/
static int
kfree_perf_shutdown(void *arg)
{
do {
wait_event(shutdown_wq,
atomic_read(&n_kfree_perf_thread_ended) >=
kfree_nrealthreads);
} while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);

smp_mb(); /* Wake before output. */

kfree_perf_cleanup();
kernel_power_off();
return -EINVAL;
}

static int __init
kfree_perf_init(void)
{
long i;
int firsterr = 0;

kfree_nrealthreads = compute_real(kfree_nthreads);
/* Start up the kthreads. */
if (shutdown) {
init_waitqueue_head(&shutdown_wq);
firsterr = torture_create_kthread(kfree_perf_shutdown, NULL,
shutdown_task);
if (firsterr)
goto unwind;
schedule_timeout_uninterruptible(1);
}

kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
GFP_KERNEL);
if (kfree_reader_tasks == NULL) {
firsterr = -ENOMEM;
goto unwind;
}

for (i = 0; i < kfree_nrealthreads; i++) {
firsterr = torture_create_kthread(kfree_perf_thread, (void *)i,
kfree_reader_tasks[i]);
if (firsterr)
goto unwind;
}

while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads)
schedule_timeout_uninterruptible(1);

torture_init_end();
return 0;

unwind:
torture_init_end();
kfree_perf_cleanup();
return firsterr;
}

static int __init
rcu_perf_init(void)
{
@@ -616,6 +770,9 @@ rcu_perf_init(void)
if (cur_ops->init)
cur_ops->init();

if (kfree_rcu_test)
return kfree_perf_init();

nrealwriters = compute_real(nwriters);
nrealreaders = compute_real(nreaders);
atomic_set(&n_rcu_perf_reader_started, 0);

@@ -1661,43 +1661,52 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp)
|
||||
struct rcu_fwd_cb {
|
||||
struct rcu_head rh;
|
||||
struct rcu_fwd_cb *rfc_next;
|
||||
struct rcu_fwd *rfc_rfp;
|
||||
int rfc_gps;
|
||||
};
|
||||
static DEFINE_SPINLOCK(rcu_fwd_lock);
|
||||
static struct rcu_fwd_cb *rcu_fwd_cb_head;
|
||||
static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head;
|
||||
static long n_launders_cb;
|
||||
static unsigned long rcu_fwd_startat;
|
||||
static bool rcu_fwd_emergency_stop;
|
||||
|
||||
#define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */
|
||||
#define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */
|
||||
#define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */
|
||||
#define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */
|
||||
#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
|
||||
|
||||
struct rcu_launder_hist {
|
||||
long n_launders;
|
||||
unsigned long launder_gp_seq;
|
||||
};
|
||||
#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
|
||||
static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
|
||||
static unsigned long rcu_launder_gp_seq_start;
|
||||
|
||||
static void rcu_torture_fwd_cb_hist(void)
|
||||
struct rcu_fwd {
|
||||
spinlock_t rcu_fwd_lock;
|
||||
struct rcu_fwd_cb *rcu_fwd_cb_head;
|
||||
struct rcu_fwd_cb **rcu_fwd_cb_tail;
|
||||
long n_launders_cb;
|
||||
unsigned long rcu_fwd_startat;
|
||||
struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
|
||||
unsigned long rcu_launder_gp_seq_start;
|
||||
};
|
||||
|
||||
struct rcu_fwd *rcu_fwds;
|
||||
bool rcu_fwd_emergency_stop;
|
||||
|
||||
static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
|
||||
{
|
||||
unsigned long gps;
|
||||
unsigned long gps_old;
|
||||
int i;
|
||||
int j;
|
||||
|
||||
for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
|
||||
if (n_launders_hist[i].n_launders > 0)
|
||||
for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
|
||||
if (rfp->n_launders_hist[i].n_launders > 0)
|
||||
break;
|
||||
pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
|
||||
__func__, jiffies - rcu_fwd_startat);
|
||||
gps_old = rcu_launder_gp_seq_start;
|
||||
__func__, jiffies - rfp->rcu_fwd_startat);
|
||||
gps_old = rfp->rcu_launder_gp_seq_start;
|
||||
for (j = 0; j <= i; j++) {
|
||||
gps = n_launders_hist[j].launder_gp_seq;
|
||||
gps = rfp->n_launders_hist[j].launder_gp_seq;
|
||||
pr_cont(" %ds/%d: %ld:%ld",
|
||||
j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders,
|
||||
j + 1, FWD_CBS_HIST_DIV,
|
||||
rfp->n_launders_hist[j].n_launders,
|
||||
rcutorture_seq_diff(gps, gps_old));
|
||||
gps_old = gps;
|
||||
}
|
||||
@@ -1711,26 +1720,27 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
|
||||
int i;
|
||||
struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh);
|
||||
struct rcu_fwd_cb **rfcpp;
|
||||
struct rcu_fwd *rfp = rfcp->rfc_rfp;
|
||||
|
||||
rfcp->rfc_next = NULL;
|
||||
rfcp->rfc_gps++;
|
||||
spin_lock_irqsave(&rcu_fwd_lock, flags);
|
||||
rfcpp = rcu_fwd_cb_tail;
|
||||
rcu_fwd_cb_tail = &rfcp->rfc_next;
|
||||
spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
|
||||
rfcpp = rfp->rcu_fwd_cb_tail;
|
||||
rfp->rcu_fwd_cb_tail = &rfcp->rfc_next;
|
||||
WRITE_ONCE(*rfcpp, rfcp);
|
||||
WRITE_ONCE(n_launders_cb, n_launders_cb + 1);
|
||||
i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
|
||||
if (i >= ARRAY_SIZE(n_launders_hist))
|
||||
i = ARRAY_SIZE(n_launders_hist) - 1;
|
||||
n_launders_hist[i].n_launders++;
|
||||
n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
|
||||
spin_unlock_irqrestore(&rcu_fwd_lock, flags);
|
||||
WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1);
|
||||
i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
|
||||
if (i >= ARRAY_SIZE(rfp->n_launders_hist))
|
||||
i = ARRAY_SIZE(rfp->n_launders_hist) - 1;
|
||||
rfp->n_launders_hist[i].n_launders++;
|
||||
rfp->n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
|
||||
spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
|
||||
}
|
||||
|
||||
// Give the scheduler a chance, even on nohz_full CPUs.
|
||||
static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
|
||||
if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
|
||||
// Real call_rcu() floods hit userspace, so emulate that.
|
||||
if (need_resched() || (iter & 0xfff))
|
||||
schedule();
|
||||
@@ -1744,23 +1754,23 @@ static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
|
||||
* Free all callbacks on the rcu_fwd_cb_head list, either because the
|
||||
* test is over or because we hit an OOM event.
|
||||
*/
|
||||
static unsigned long rcu_torture_fwd_prog_cbfree(void)
|
||||
static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp)
|
||||
{
|
||||
unsigned long flags;
|
||||
unsigned long freed = 0;
|
||||
struct rcu_fwd_cb *rfcp;
|
||||
|
||||
for (;;) {
|
||||
spin_lock_irqsave(&rcu_fwd_lock, flags);
|
||||
rfcp = rcu_fwd_cb_head;
|
||||
spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
|
||||
rfcp = rfp->rcu_fwd_cb_head;
|
||||
if (!rfcp) {
|
||||
spin_unlock_irqrestore(&rcu_fwd_lock, flags);
|
||||
spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
|
||||
break;
|
||||
}
|
||||
rcu_fwd_cb_head = rfcp->rfc_next;
|
||||
if (!rcu_fwd_cb_head)
|
||||
rcu_fwd_cb_tail = &rcu_fwd_cb_head;
|
||||
spin_unlock_irqrestore(&rcu_fwd_lock, flags);
|
||||
rfp->rcu_fwd_cb_head = rfcp->rfc_next;
|
||||
if (!rfp->rcu_fwd_cb_head)
|
||||
rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
|
||||
spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
|
||||
kfree(rfcp);
|
||||
freed++;
|
||||
rcu_torture_fwd_prog_cond_resched(freed);
|
||||
@@ -1774,7 +1784,8 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
|
||||
}
|
||||
|
||||
/* Carry out need_resched()/cond_resched() forward-progress testing. */
|
||||
static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
|
||||
static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
|
||||
int *tested, int *tested_tries)
|
||||
{
|
||||
unsigned long cver;
|
||||
unsigned long dur;
|
||||
@@ -1804,8 +1815,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
|
||||
sd = cur_ops->stall_dur() + 1;
|
||||
sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
|
||||
dur = sd4 + torture_random(&trs) % (sd - sd4);
|
||||
WRITE_ONCE(rcu_fwd_startat, jiffies);
|
||||
stopat = rcu_fwd_startat + dur;
|
||||
WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
|
||||
stopat = rfp->rcu_fwd_startat + dur;
|
||||
	while (time_before(jiffies, stopat) &&
	       !shutdown_time_arrived() &&
	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
@@ -1840,7 +1851,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
}

/* Carry out call_rcu() forward-progress testing. */
static void rcu_torture_fwd_prog_cr(void)
static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
{
	unsigned long cver;
	unsigned long flags;
@@ -1864,23 +1875,23 @@ static void rcu_torture_fwd_prog_cr(void)
	/* Loop continuously posting RCU callbacks. */
	WRITE_ONCE(rcu_fwd_cb_nodelay, true);
	cur_ops->sync(); /* Later readers see above write. */
	WRITE_ONCE(rcu_fwd_startat, jiffies);
	stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
	WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
	stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
	n_launders = 0;
	n_launders_cb = 0;
	rfp->n_launders_cb = 0; // Hoist initialization for multi-kthread
	n_launders_sa = 0;
	n_max_cbs = 0;
	n_max_gps = 0;
	for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++)
		n_launders_hist[i].n_launders = 0;
	for (i = 0; i < ARRAY_SIZE(rfp->n_launders_hist); i++)
		rfp->n_launders_hist[i].n_launders = 0;
	cver = READ_ONCE(rcu_torture_current_version);
	gps = cur_ops->get_gp_seq();
	rcu_launder_gp_seq_start = gps;
	rfp->rcu_launder_gp_seq_start = gps;
	tick_dep_set_task(current, TICK_DEP_BIT_RCU);
	while (time_before(jiffies, stopat) &&
	       !shutdown_time_arrived() &&
	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
		rfcp = READ_ONCE(rcu_fwd_cb_head);
		rfcp = READ_ONCE(rfp->rcu_fwd_cb_head);
		rfcpn = NULL;
		if (rfcp)
			rfcpn = READ_ONCE(rfcp->rfc_next);
@@ -1888,7 +1899,7 @@ static void rcu_torture_fwd_prog_cr(void)
			if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS &&
			    ++n_max_gps >= MIN_FWD_CBS_LAUNDERED)
				break;
			rcu_fwd_cb_head = rfcpn;
			rfp->rcu_fwd_cb_head = rfcpn;
			n_launders++;
			n_launders_sa++;
		} else {
@@ -1900,6 +1911,7 @@ static void rcu_torture_fwd_prog_cr(void)
			n_max_cbs++;
			n_launders_sa = 0;
			rfcp->rfc_gps = 0;
			rfcp->rfc_rfp = rfp;
		}
		cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
		rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
@@ -1910,22 +1922,22 @@ static void rcu_torture_fwd_prog_cr(void)
		}
	}
	stoppedat = jiffies;
	n_launders_cb_snap = READ_ONCE(n_launders_cb);
	n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
	cver = READ_ONCE(rcu_torture_current_version) - cver;
	gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
	cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
	(void)rcu_torture_fwd_prog_cbfree();
	(void)rcu_torture_fwd_prog_cbfree(rfp);

	if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
	    !shutdown_time_arrived()) {
		WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
		pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
			 __func__,
			 stoppedat - rcu_fwd_startat, jiffies - stoppedat,
			 stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat,
			 n_launders + n_max_cbs - n_launders_cb_snap,
			 n_launders, n_launders_sa,
			 n_max_gps, n_max_cbs, cver, gps);
		rcu_torture_fwd_cb_hist();
		rcu_torture_fwd_cb_hist(rfp);
	}
	schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
	tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
@@ -1940,20 +1952,22 @@ static void rcu_torture_fwd_prog_cr(void)
static int rcutorture_oom_notify(struct notifier_block *self,
				 unsigned long notused, void *nfreed)
{
	struct rcu_fwd *rfp = rcu_fwds;

	WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
	     __func__);
	rcu_torture_fwd_cb_hist();
	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);
	rcu_torture_fwd_cb_hist(rfp);
	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2);
	WRITE_ONCE(rcu_fwd_emergency_stop, true);
	smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
	pr_info("%s: Freed %lu RCU callbacks.\n",
		__func__, rcu_torture_fwd_prog_cbfree());
		__func__, rcu_torture_fwd_prog_cbfree(rfp));
	rcu_barrier();
	pr_info("%s: Freed %lu RCU callbacks.\n",
		__func__, rcu_torture_fwd_prog_cbfree());
		__func__, rcu_torture_fwd_prog_cbfree(rfp));
	rcu_barrier();
	pr_info("%s: Freed %lu RCU callbacks.\n",
		__func__, rcu_torture_fwd_prog_cbfree());
		__func__, rcu_torture_fwd_prog_cbfree(rfp));
	smp_mb(); /* Frees before return to avoid redoing OOM. */
	(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
	pr_info("%s returning after OOM processing.\n", __func__);
@@ -1967,6 +1981,7 @@ static struct notifier_block rcutorture_oom_nb = {
/* Carry out grace-period forward-progress testing. */
static int rcu_torture_fwd_prog(void *args)
{
	struct rcu_fwd *rfp = args;
	int tested = 0;
	int tested_tries = 0;

@@ -1978,8 +1993,8 @@ static int rcu_torture_fwd_prog(void *args)
		schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
		WRITE_ONCE(rcu_fwd_emergency_stop, false);
		register_oom_notifier(&rcutorture_oom_nb);
		rcu_torture_fwd_prog_nr(&tested, &tested_tries);
		rcu_torture_fwd_prog_cr();
		rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
		rcu_torture_fwd_prog_cr(rfp);
		unregister_oom_notifier(&rcutorture_oom_nb);

	/* Avoid slow periods, better to test when busy. */
@@ -1995,6 +2010,8 @@ static int rcu_torture_fwd_prog(void *args)
/* If forward-progress checking is requested and feasible, spawn the thread. */
static int __init rcu_torture_fwd_prog_init(void)
{
	struct rcu_fwd *rfp;

	if (!fwd_progress)
		return 0; /* Not requested, so don't do it. */
	if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 ||
@@ -2013,8 +2030,12 @@ static int __init rcu_torture_fwd_prog_init(void)
		fwd_progress_holdoff = 1;
	if (fwd_progress_div <= 0)
		fwd_progress_div = 4;
	return torture_create_kthread(rcu_torture_fwd_prog,
				      NULL, fwd_prog_task);
	rfp = kzalloc(sizeof(*rfp), GFP_KERNEL);
	if (!rfp)
		return -ENOMEM;
	spin_lock_init(&rfp->rcu_fwd_lock);
	rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
	return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
}

/* Callback function for RCU barrier testing. */
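The rcutorture conversion above hoists what had been file-scope forward-progress state into a kzalloc()ed struct rcu_fwd that rcu_torture_fwd_prog_init() hands to the kthread as its argument. A minimal userspace sketch of the same ownership pattern, with POSIX threads standing in for torture_create_kthread(); all names here are illustrative, not kernel code:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fwd_state {                       /* plays the role of struct rcu_fwd */
	pthread_mutex_t lock;
	unsigned long n_launders;
};

static void *fwd_worker(void *args)      /* like rcu_torture_fwd_prog(void *args) */
{
	struct fwd_state *fsp = args;    /* all state reached via one pointer */

	pthread_mutex_lock(&fsp->lock);
	fsp->n_launders++;
	pthread_mutex_unlock(&fsp->lock);
	return NULL;
}

int main(void)
{
	struct fwd_state *fsp = calloc(1, sizeof(*fsp));
	pthread_t tid;

	if (!fsp)
		return 1;                /* like returning -ENOMEM */
	pthread_mutex_init(&fsp->lock, NULL);
	pthread_create(&tid, NULL, fwd_worker, fsp); /* like torture_create_kthread() */
	pthread_join(tid, NULL);
	printf("n_launders = %lu\n", fsp->n_launders);
	free(fsp);
	return 0;
}

Keeping every per-instance field behind the single pointer is what later allows several forward-progress kthreads to run concurrently without touching globals.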
@@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);

/*
 * Workqueue handler to drive one grace period and invoke any callbacks
 * that become ready as a result. Single-CPU and !PREEMPT operation
 * that become ready as a result. Single-CPU and !PREEMPTION operation
 * means that we get away with murder on synchronization. ;-)
 */
void srcu_drive_gp(struct work_struct *wp)

@@ -530,7 +530,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
	idx = rcu_seq_state(ssp->srcu_gp_seq);
	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
	cbdelay = srcu_get_delay(ssp);
	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
	WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
	rcu_seq_end(&ssp->srcu_gp_seq);
	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
@@ -762,6 +762,7 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
	unsigned long flags;
	struct srcu_data *sdp;
	unsigned long t;
	unsigned long tlast;

	/* If the local srcu_data structure has callbacks, not idle. */
	local_irq_save(flags);
@@ -780,9 +781,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)

	/* First, see if enough time has passed since the last GP. */
	t = ktime_get_mono_fast_ns();
	tlast = READ_ONCE(ssp->srcu_last_gp_end);
	if (exp_holdoff == 0 ||
	    time_in_range_open(t, ssp->srcu_last_gp_end,
			       ssp->srcu_last_gp_end + exp_holdoff))
	    time_in_range_open(t, tlast, tlast + exp_holdoff))
		return false; /* Too soon after last GP. */

	/* Next, check for probable idleness. */
@@ -853,7 +854,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
	local_irq_save(flags);
	sdp = this_cpu_ptr(ssp->sda);
	spin_lock_rcu_node(sdp);
	rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
	rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
	rcu_segcblist_advance(&sdp->srcu_cblist,
			      rcu_seq_current(&ssp->srcu_gp_seq));
	s = rcu_seq_snap(&ssp->srcu_gp_seq);
@@ -1052,7 +1053,7 @@ void srcu_barrier(struct srcu_struct *ssp)
		sdp->srcu_barrier_head.func = srcu_barrier_cb;
		debug_rcu_head_queue(&sdp->srcu_barrier_head);
		if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
					   &sdp->srcu_barrier_head, 0)) {
					   &sdp->srcu_barrier_head)) {
			debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
			atomic_dec(&ssp->srcu_barrier_cpu_cnt);
		}
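The srcutree hunks above pair a WRITE_ONCE() of ->srcu_last_gp_end with a single READ_ONCE() snapshot (tlast) that is then used for both bounds of the holdoff check, so a concurrent update cannot make the range test internally inconsistent. A userspace sketch of the same idea, with C11 relaxed atomics standing in for the kernel's READ_ONCE()/WRITE_ONCE(); illustrative only:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned long last_gp_end;

static void gp_end(unsigned long now)
{
	/* like WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()) */
	atomic_store_explicit(&last_gp_end, now, memory_order_relaxed);
}

static bool too_soon(unsigned long now, unsigned long holdoff)
{
	/* like tlast = READ_ONCE(ssp->srcu_last_gp_end) */
	unsigned long tlast = atomic_load_explicit(&last_gp_end,
						   memory_order_relaxed);

	/* Both bounds come from the same snapshot of the timestamp. */
	return now - tlast < holdoff;
}

int main(void)
{
	gp_end(100);
	printf("too_soon(150, 100) = %d\n", too_soon(150, 100)); /* 1 */
	printf("too_soon(250, 100) = %d\n", too_soon(250, 100)); /* 0 */
	return 0;
}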
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
#include <linux/slab.h>

#include "rcu.h"

@@ -73,6 +74,31 @@ void rcu_sched_clock_irq(int user)
	}
}

/*
 * Reclaim the specified callback, either by invoking it for non-kfree cases or
 * freeing it directly (for kfree). Return true if kfreeing, false otherwise.
 */
static inline bool rcu_reclaim_tiny(struct rcu_head *head)
{
	rcu_callback_t f;
	unsigned long offset = (unsigned long)head->func;

	rcu_lock_acquire(&rcu_callback_map);
	if (__is_kfree_rcu_offset(offset)) {
		trace_rcu_invoke_kfree_callback("", head, offset);
		kfree((void *)head - offset);
		rcu_lock_release(&rcu_callback_map);
		return true;
	}

	trace_rcu_invoke_callback("", head);
	f = head->func;
	WRITE_ONCE(head->func, (rcu_callback_t)0L);
	f(head);
	rcu_lock_release(&rcu_callback_map);
	return false;
}

/* Invoke the RCU callbacks whose grace period has elapsed. */
static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
@@ -100,7 +126,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
		prefetch(next);
		debug_rcu_head_unqueue(list);
		local_bh_disable();
		__rcu_reclaim("", list);
		rcu_reclaim_tiny(list);
		local_bh_enable();
		list = next;
	}
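rcu_reclaim_tiny() above relies on the kfree_rcu() convention that a small value stored in the rcu_head's callback slot is really the offset of that rcu_head within its enclosing structure, so the reclaimer can free the object directly instead of calling a function. A self-contained userspace sketch of that dispatch, with a hypothetical 4096 threshold playing the role of __is_kfree_rcu_offset(); illustrative only:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct head {
	void (*func)(struct head *);
};

static int is_free_offset(unsigned long v)
{
	return v < 4096;                 /* like __is_kfree_rcu_offset() */
}

static int reclaim(struct head *h)       /* like rcu_reclaim_tiny() */
{
	unsigned long offset = (unsigned long)h->func;

	if (is_free_offset(offset)) {
		free((char *)h - offset); /* free the enclosing object */
		return 1;
	}
	h->func(h);                       /* invoke the real callback */
	return 0;
}

struct obj {
	int payload;
	struct head rh;                   /* embedded at a nonzero offset */
};

static void cb(struct head *h) { (void)h; printf("callback invoked\n"); }

int main(void)
{
	struct obj *p = malloc(sizeof(*p));
	struct head stack_h = { .func = cb };

	p->rh.func = (void (*)(struct head *))(unsigned long)offsetof(struct obj, rh);
	printf("freed=%d\n", reclaim(&p->rh));   /* frees p */
	printf("freed=%d\n", reclaim(&stack_h)); /* runs cb() */
	return 0;
}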
@@ -43,7 +43,6 @@
#include <uapi/linux/sched/types.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/trace_events.h>
#include <linux/suspend.h>
@@ -55,6 +54,7 @@
#include <linux/oom.h>
#include <linux/smpboot.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/sched/isolation.h>
#include <linux/sched/clock.h>
#include "../time/tick-internal.h"
@@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
	.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
};
struct rcu_state rcu_state = {
static struct rcu_state rcu_state = {
	.level = { &rcu_state.node[0] },
	.gp_state = RCU_GP_IDLE,
	.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
@@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
 * held, but the bit corresponding to the current CPU will be stable
 * in most contexts.
 */
unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
	return READ_ONCE(rnp->qsmaskinitnext);
}
@@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void)
 *
 * No ordering, as we are sampling CPU-local information.
 */
bool rcu_dynticks_curr_cpu_in_eqs(void)
static bool rcu_dynticks_curr_cpu_in_eqs(void)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

@@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
 * Snapshot the ->dynticks counter with full ordering so as to allow
 * stable comparison of this counter with past and future snapshots.
 */
int rcu_dynticks_snap(struct rcu_data *rdp)
static int rcu_dynticks_snap(struct rcu_data *rdp)
{
	int snap = atomic_add_return(0, &rdp->dynticks);

@@ -528,16 +528,6 @@ static struct rcu_node *rcu_get_root(void)
	return &rcu_state.node[0];
}

/*
 * Convert a ->gp_state value to a character string.
 */
static const char *gp_state_getname(short gs)
{
	if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
		return "???";
	return gp_state_names[gs];
}

/*
 * Send along grace-period-related data for rcutorture diagnostics.
 */
@@ -577,7 +567,7 @@ static void rcu_eqs_enter(bool user)
	}

	lockdep_assert_irqs_disabled();
	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks);
	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
	rdp = this_cpu_ptr(&rcu_data);
	do_nocb_deferred_wakeup(rdp);
@@ -650,14 +640,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
	 * leave it in non-RCU-idle state.
	 */
	if (rdp->dynticks_nmi_nesting != 1) {
		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
				  atomic_read(&rdp->dynticks));
		WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
			   rdp->dynticks_nmi_nesting - 2);
		return;
	}

	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks);
	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */

	if (irq)
@@ -744,7 +735,7 @@ static void rcu_eqs_exit(bool user)
	rcu_dynticks_task_exit();
	rcu_dynticks_eqs_exit();
	rcu_cleanup_after_idle();
	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
	WRITE_ONCE(rdp->dynticks_nesting, 1);
	WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -800,8 +791,8 @@ void rcu_user_exit(void)
 */
static __always_inline void rcu_nmi_enter_common(bool irq)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	long incby = 2;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	/* Complain about underflow. */
	WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
@@ -828,12 +819,17 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
	} else if (tick_nohz_full_cpu(rdp->cpu) &&
		   rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
		   READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
		rdp->rcu_forced_tick = true;
		tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
		raw_spin_lock_rcu_node(rdp->mynode);
		// Recheck under lock.
		if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
			rdp->rcu_forced_tick = true;
			tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
		}
		raw_spin_unlock_rcu_node(rdp->mynode);
	}
	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
			  rdp->dynticks_nmi_nesting,
			  rdp->dynticks_nmi_nesting + incby, rdp->dynticks);
			  rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
	WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
		   rdp->dynticks_nmi_nesting + incby);
	barrier();
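The rcu_nmi_enter_common() hunk above moves the urgency test under the rcu_node lock and repeats it there, so two racing paths cannot both set the tick dependency. A userspace sketch of this recheck-under-lock pattern using a pthread mutex; names illustrative, not kernel code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static bool urgent_qs = true;
static bool forced_tick;
static int tick_dep_sets;

static void maybe_force_tick(void)
{
	if (urgent_qs && !forced_tick) {         /* cheap lockless test */
		pthread_mutex_lock(&node_lock);
		if (urgent_qs && !forced_tick) { /* recheck under lock */
			forced_tick = true;
			tick_dep_sets++;         /* like tick_dep_set_cpu() */
		}
		pthread_mutex_unlock(&node_lock);
	}
}

int main(void)
{
	maybe_force_tick();
	maybe_force_tick(); /* second call sees forced_tick and backs off */
	printf("tick_dep_sets = %d\n", tick_dep_sets); /* prints 1 */
	return 0;
}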
@@ -898,6 +894,7 @@ void rcu_irq_enter_irqson(void)
 */
static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
{
	raw_lockdep_assert_held_rcu_node(rdp->mynode);
	WRITE_ONCE(rdp->rcu_urgent_qs, false);
	WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
	if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
@@ -1934,7 +1931,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
	struct rcu_node *rnp_p;

	raw_lockdep_assert_held_rcu_node(rnp);
	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
	    rnp->qsmask != 0) {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2146,7 +2143,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
	/* If no callbacks are ready, just return. */
	if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
		trace_rcu_batch_start(rcu_state.name,
				      rcu_segcblist_n_lazy_cbs(&rdp->cblist),
				      rcu_segcblist_n_cbs(&rdp->cblist), 0);
		trace_rcu_batch_end(rcu_state.name, 0,
				    !rcu_segcblist_empty(&rdp->cblist),
@@ -2168,7 +2164,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
	if (unlikely(bl > 100))
		tlimit = local_clock() + rcu_resched_ns;
	trace_rcu_batch_start(rcu_state.name,
			      rcu_segcblist_n_lazy_cbs(&rdp->cblist),
			      rcu_segcblist_n_cbs(&rdp->cblist), bl);
	rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
	if (offloaded)
@@ -2179,9 +2174,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
	tick_dep_set_task(current, TICK_DEP_BIT_RCU);
	rhp = rcu_cblist_dequeue(&rcl);
	for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
		rcu_callback_t f;

		debug_rcu_head_unqueue(rhp);
		if (__rcu_reclaim(rcu_state.name, rhp))
			rcu_cblist_dequeued_lazy(&rcl);

		rcu_lock_acquire(&rcu_callback_map);
		trace_rcu_invoke_callback(rcu_state.name, rhp);

		f = rhp->func;
		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
		f(rhp);

		rcu_lock_release(&rcu_callback_map);

		/*
		 * Stop only if limit reached and CPU has something to do.
		 * Note: The rcl structure counts down from zero.
@@ -2294,7 +2299,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
		mask = 0;
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		if (rnp->qsmask == 0) {
			if (!IS_ENABLED(CONFIG_PREEMPTION) ||
			if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
			    rcu_preempt_blocked_readers_cgp(rnp)) {
				/*
				 * No point in scanning bits because they
@@ -2308,14 +2313,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
			continue;
		}
		for_each_leaf_node_possible_cpu(rnp, cpu) {
			unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
			if ((rnp->qsmask & bit) != 0) {
				rdp = per_cpu_ptr(&rcu_data, cpu);
				if (f(rdp)) {
					mask |= bit;
					rcu_disable_urgency_upon_qs(rdp);
				}
		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
			rdp = per_cpu_ptr(&rcu_data, cpu);
			if (f(rdp)) {
				mask |= rdp->grpmask;
				rcu_disable_urgency_upon_qs(rdp);
			}
		}
		if (mask != 0) {
@@ -2474,8 +2476,8 @@ static void rcu_cpu_kthread(unsigned int cpu)
	char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
	int spincnt;

	trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
	for (spincnt = 0; spincnt < 10; spincnt++) {
		trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
		local_bh_disable();
		*statusp = RCU_KTHREAD_RUNNING;
		local_irq_disable();
@@ -2583,7 +2585,7 @@ static void rcu_leak_callback(struct rcu_head *rhp)
 * is expected to specify a CPU.
 */
static void
__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
__call_rcu(struct rcu_head *head, rcu_callback_t func)
{
	unsigned long flags;
	struct rcu_data *rdp;
@@ -2618,18 +2620,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
		if (rcu_segcblist_empty(&rdp->cblist))
			rcu_segcblist_init(&rdp->cblist);
	}

	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
		return; // Enqueued onto ->nocb_bypass, so just leave.
	/* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
	rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
	rcu_segcblist_enqueue(&rdp->cblist, head);
	if (__is_kfree_rcu_offset((unsigned long)func))
		trace_rcu_kfree_callback(rcu_state.name, head,
					 (unsigned long)func,
					 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
					 rcu_segcblist_n_cbs(&rdp->cblist));
	else
		trace_rcu_callback(rcu_state.name, head,
				   rcu_segcblist_n_lazy_cbs(&rdp->cblist),
				   rcu_segcblist_n_cbs(&rdp->cblist));

	/* Go handle any RCU core processing required. */
@@ -2679,28 +2680,230 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
 */
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
	__call_rcu(head, func, 0);
	__call_rcu(head, func);
}
EXPORT_SYMBOL_GPL(call_rcu);

/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (HZ / 50)
#define KFREE_N_BATCHES 2

/**
 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
 * @head_free: List of kfree_rcu() objects waiting for a grace period
 * @krcp: Pointer to @kfree_rcu_cpu structure
 */

struct kfree_rcu_cpu_work {
	struct rcu_work rcu_work;
	struct rcu_head *head_free;
	struct kfree_rcu_cpu *krcp;
};

/**
 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
 * @head: List of kfree_rcu() objects not yet waiting for a grace period
 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
 * @lock: Synchronize access to this structure
 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
 * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
 * @initialized: The @lock and @rcu_work fields have been initialized
 *
 * This is a per-CPU structure. The reason that it is not included in
 * the rcu_data structure is to permit this code to be extracted from
 * the RCU files. Such extraction could allow further optimization of
 * the interactions with the slab allocators.
 */
struct kfree_rcu_cpu {
	struct rcu_head *head;
	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
	spinlock_t lock;
	struct delayed_work monitor_work;
	bool monitor_todo;
	bool initialized;
};

static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
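The structures above decouple kfree_rcu() batching from rcu_data: one pending ->head list feeds a small fixed array of in-flight batches, each of which keeps a back-pointer to its parent the way krw_arr[i].krcp does. A compact userspace model of that layout; illustrative only, not the kernel data structures:

#include <stdio.h>

#define N_BATCHES 2

struct node { struct node *next; };

struct batch_cpu;

struct batch_work {                      /* like struct kfree_rcu_cpu_work */
	struct node *head_free;          /* objects waiting for a "grace period" */
	struct batch_cpu *bcp;           /* back-pointer, like krwp->krcp */
};

struct batch_cpu {                       /* like struct kfree_rcu_cpu */
	struct node *head;               /* objects not yet batched */
	struct batch_work work[N_BATCHES];
};

int main(void)
{
	struct batch_cpu bc = { 0 };
	struct node n1 = { 0 }, n2 = { 0 };
	int i, slot = -1;

	for (i = 0; i < N_BATCHES; i++)
		bc.work[i].bcp = &bc;    /* like kfree_rcu_batch_init() */
	n1.next = NULL;  bc.head = &n1;  /* queue two objects */
	n2.next = &n1;   bc.head = &n2;

	/* Promote the pending list into the first free batch slot. */
	for (i = 0; i < N_BATCHES; i++)
		if (!bc.work[i].head_free) {
			bc.work[i].head_free = bc.head;
			bc.head = NULL;
			slot = i;
			break;
		}
	printf("pending list moved to batch slot %d\n", slot);
	return 0;
}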
/*
 * Queue an RCU callback for lazy invocation after a grace period.
 * This will likely be later named something like "call_rcu_lazy()",
 * but this change will require some way of tagging the lazy RCU
 * callbacks in the list of pending callbacks. Until then, this
 * function may only be called from __kfree_rcu().
 * This function is invoked in workqueue context after a grace period.
 * It frees all the objects queued on ->head_free.
 */
static void kfree_rcu_work(struct work_struct *work)
{
	unsigned long flags;
	struct rcu_head *head, *next;
	struct kfree_rcu_cpu *krcp;
	struct kfree_rcu_cpu_work *krwp;

	krwp = container_of(to_rcu_work(work),
			    struct kfree_rcu_cpu_work, rcu_work);
	krcp = krwp->krcp;
	spin_lock_irqsave(&krcp->lock, flags);
	head = krwp->head_free;
	krwp->head_free = NULL;
	spin_unlock_irqrestore(&krcp->lock, flags);

	// List "head" is now private, so traverse locklessly.
	for (; head; head = next) {
		unsigned long offset = (unsigned long)head->func;

		next = head->next;
		// Potentially optimize with kfree_bulk in future.
		debug_rcu_head_unqueue(head);
		rcu_lock_acquire(&rcu_callback_map);
		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);

		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
			/* Could be optimized with kfree_bulk() in future. */
			kfree((void *)head - offset);
		}

		rcu_lock_release(&rcu_callback_map);
		cond_resched_tasks_rcu_qs();
	}
}

/*
 * Schedule the kfree batch RCU work to run in workqueue context after a GP.
 *
 * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
 * timeout has been reached.
 */
static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
{
	int i;
	struct kfree_rcu_cpu_work *krwp = NULL;

	lockdep_assert_held(&krcp->lock);
	for (i = 0; i < KFREE_N_BATCHES; i++)
		if (!krcp->krw_arr[i].head_free) {
			krwp = &(krcp->krw_arr[i]);
			break;
		}

	// If a previous RCU batch is in progress, we cannot immediately
	// queue another one, so return false to tell caller to retry.
	if (!krwp)
		return false;

	krwp->head_free = krcp->head;
	krcp->head = NULL;
	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
	queue_rcu_work(system_wq, &krwp->rcu_work);
	return true;
}

static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
					  unsigned long flags)
{
	// Attempt to start a new batch.
	krcp->monitor_todo = false;
	if (queue_kfree_rcu_work(krcp)) {
		// Success! Our job is done here.
		spin_unlock_irqrestore(&krcp->lock, flags);
		return;
	}

	// Previous RCU batch still in progress, try again later.
	krcp->monitor_todo = true;
	schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
	spin_unlock_irqrestore(&krcp->lock, flags);
}

/*
 * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
 * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
 */
static void kfree_rcu_monitor(struct work_struct *work)
{
	unsigned long flags;
	struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
						  monitor_work.work);

	spin_lock_irqsave(&krcp->lock, flags);
	if (krcp->monitor_todo)
		kfree_rcu_drain_unlock(krcp, flags);
	else
		spin_unlock_irqrestore(&krcp->lock, flags);
}

/*
 * Queue a request for lazy invocation of kfree() after a grace period.
 *
 * Each kfree_call_rcu() request is added to a batch. The batch will be drained
 * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
 * will be kfree'd in workqueue context. This allows us to:
 *
 * 1. Batch requests together to reduce the number of grace periods during
 *    heavy kfree_rcu() load.
 *
 * 2. It makes it possible to use kfree_bulk() on a large number of
 *    kfree_rcu() requests thus reducing cache misses and the per-object
 *    overhead of kfree().
 */
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
	__call_rcu(head, func, 1);
	unsigned long flags;
	struct kfree_rcu_cpu *krcp;

	local_irq_save(flags);	// For safely calling this_cpu_ptr().
	krcp = this_cpu_ptr(&krc);
	if (krcp->initialized)
		spin_lock(&krcp->lock);

	// Queue the object but don't yet schedule the batch.
	if (debug_rcu_head_queue(head)) {
		// Probable double kfree_rcu(), just leak.
		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
			  __func__, head);
		goto unlock_return;
	}
	head->func = func;
	head->next = krcp->head;
	krcp->head = head;

	// Set timer to drain after KFREE_DRAIN_JIFFIES.
	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
	    !krcp->monitor_todo) {
		krcp->monitor_todo = true;
		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
	}

unlock_return:
	if (krcp->initialized)
		spin_unlock(&krcp->lock);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);

void __init kfree_rcu_scheduler_running(void)
{
	int cpu;
	unsigned long flags;

	for_each_online_cpu(cpu) {
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		spin_lock_irqsave(&krcp->lock, flags);
		if (!krcp->head || krcp->monitor_todo) {
			spin_unlock_irqrestore(&krcp->lock, flags);
			continue;
		}
		krcp->monitor_todo = true;
		schedule_delayed_work_on(cpu, &krcp->monitor_work,
					 KFREE_DRAIN_JIFFIES);
		spin_unlock_irqrestore(&krcp->lock, flags);
	}
}
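kfree_rcu_drain_unlock() above encodes a simple retry state machine: hand the pending list to a free batch slot, and if every slot is still waiting on a grace period, re-arm the monitor instead of dropping the work. A userspace sketch of that logic; illustrative only:

#include <stdbool.h>
#include <stdio.h>

#define N_BATCHES 2

struct state {
	int pending;                     /* stand-in for the ->head list */
	bool slot_busy[N_BATCHES];
	bool monitor_todo;
	int rearmed;
};

static bool try_queue_batch(struct state *s) /* like queue_kfree_rcu_work() */
{
	for (int i = 0; i < N_BATCHES; i++)
		if (!s->slot_busy[i]) {
			s->slot_busy[i] = true;
			s->pending = 0;
			return true;
		}
	return false; /* all batches in flight, caller must retry */
}

static void monitor(struct state *s)     /* like kfree_rcu_drain_unlock() */
{
	s->monitor_todo = false;
	if (try_queue_batch(s))
		return;                  /* success, done until new objects arrive */
	s->monitor_todo = true;          /* re-arm, like schedule_delayed_work() */
	s->rearmed++;
}

int main(void)
{
	struct state s = { .pending = 3 };

	monitor(&s);        /* first batch slot taken */
	s.pending = 2;
	monitor(&s);        /* second slot taken */
	s.pending = 1;
	monitor(&s);        /* both busy: must re-arm */
	printf("rearmed=%d todo=%d\n", s.rearmed, s.monitor_todo);
	return 0;
}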
/*
 * During early boot, any blocking grace-period wait automatically
 * implies a grace period. Later on, this is never the case for PREEMPT.
 * implies a grace period. Later on, this is never the case for PREEMPTION.
 *
 * Howevr, because a context switch is a grace period for !PREEMPT, any
 * Howevr, because a context switch is a grace period for !PREEMPTION, any
 * blocking grace-period wait automatically implies a grace period if
 * there is only one CPU online at any point time during execution of
 * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
@@ -2896,7 +3099,7 @@ static void rcu_barrier_func(void *unused)
	debug_rcu_head_queue(&rdp->barrier_head);
	rcu_nocb_lock(rdp);
	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
		atomic_inc(&rcu_state.barrier_cpu_count);
	} else {
		debug_rcu_head_unqueue(&rdp->barrier_head);
@@ -3557,12 +3760,29 @@ static void __init rcu_dump_rcu_node_tree(void)
struct workqueue_struct *rcu_gp_wq;
struct workqueue_struct *rcu_par_gp_wq;

static void __init kfree_rcu_batch_init(void)
{
	int cpu;
	int i;

	for_each_possible_cpu(cpu) {
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		spin_lock_init(&krcp->lock);
		for (i = 0; i < KFREE_N_BATCHES; i++)
			krcp->krw_arr[i].krcp = krcp;
		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
		krcp->initialized = true;
	}
}

void __init rcu_init(void)
{
	int cpu;

	rcu_early_boot_tests();

	kfree_rcu_batch_init();
	rcu_bootup_announce();
	rcu_init_geometry();
	rcu_init_one();

@@ -16,7 +16,6 @@
#include <linux/cpumask.h>
#include <linux/seqlock.h>
#include <linux/swait.h>
#include <linux/stop_machine.h>
#include <linux/rcu_node_tree.h>

#include "rcu_segcblist.h"
@@ -182,8 +181,8 @@ struct rcu_data {
	bool rcu_need_heavy_qs;		/* GP old, so heavy quiescent state! */
	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
	bool rcu_forced_tick;		/* Forced tick to provide QS. */
	bool rcu_forced_tick_exp;	/*   ... provide QS to expedited GP. */
#ifdef CONFIG_RCU_FAST_NO_HZ
	bool all_lazy;			/* All CPU's CBs lazy at idle start? */
	unsigned long last_accelerate;	/* Last jiffy CBs were accelerated. */
	unsigned long last_advance_all;	/* Last jiffy CBs were all advanced. */
	int tick_nohz_enabled_snap;	/* Previously seen value from sysfs. */
@@ -368,18 +367,6 @@ struct rcu_state {
#define RCU_GP_CLEANUP   7	/* Grace-period cleanup started. */
#define RCU_GP_CLEANED   8	/* Grace-period cleanup complete. */

static const char * const gp_state_names[] = {
	"RCU_GP_IDLE",
	"RCU_GP_WAIT_GPS",
	"RCU_GP_DONE_GPS",
	"RCU_GP_ONOFF",
	"RCU_GP_INIT",
	"RCU_GP_WAIT_FQS",
	"RCU_GP_DOING_FQS",
	"RCU_GP_CLEANUP",
	"RCU_GP_CLEANED",
};

/*
 * In order to export the rcu_state name to the tracing tools, it
 * needs to be added in the __tracepoint_string section.
@@ -403,8 +390,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
#define RCU_NAME rcu_name
#endif /* #else #ifdef CONFIG_TRACING */

int rcu_dynticks_snap(struct rcu_data *rdp);

/* Forward declarations for tree_plugin.h */
static void rcu_bootup_announce(void);
static void rcu_qs(void);
@@ -415,7 +400,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_flavor_sched_clock_irq(int user);
void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);

@@ -21,7 +21,7 @@ static void rcu_exp_gp_seq_start(void)
}

/*
 * Return then value that expedited-grace-period counter will have
 * Return the value that the expedited-grace-period counter will have
 * at the end of the current grace period.
 */
static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
@@ -39,7 +39,9 @@ static void rcu_exp_gp_seq_end(void)
}

/*
 * Take a snapshot of the expedited-grace-period counter.
 * Take a snapshot of the expedited-grace-period counter, which is the
 * earliest value that will indicate that a full grace period has
 * elapsed since the current time.
 */
static unsigned long rcu_exp_gp_seq_snap(void)
{
@@ -134,7 +136,7 @@ static void __maybe_unused sync_exp_reset_tree(void)
	rcu_for_each_node_breadth_first(rnp) {
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		WARN_ON_ONCE(rnp->expmask);
		rnp->expmask = rnp->expmaskinit;
		WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	}
}
@@ -143,31 +145,26 @@ static void __maybe_unused sync_exp_reset_tree(void)
 * Return non-zero if there is no RCU expedited grace period in progress
 * for the specified rcu_node structure, in other words, if all CPUs and
 * tasks covered by the specified rcu_node structure have done their bit
 * for the current expedited grace period. Works only for preemptible
 * RCU -- other RCU implementation use other means.
 *
 * Caller must hold the specificed rcu_node structure's ->lock
 * for the current expedited grace period.
 */
static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
static bool sync_rcu_exp_done(struct rcu_node *rnp)
{
	raw_lockdep_assert_held_rcu_node(rnp);

	return rnp->exp_tasks == NULL &&
	       READ_ONCE(rnp->expmask) == 0;
}

/*
 * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
 * doesn't hold the rcu_node's ->lock, and will acquire and release the lock
 * itself
 * Like sync_rcu_exp_done(), but where the caller does not hold the
 * rcu_node's ->lock.
 */
static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
{
	unsigned long flags;
	bool ret;

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	ret = sync_rcu_preempt_exp_done(rnp);
	ret = sync_rcu_exp_done(rnp);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	return ret;
@@ -181,8 +178,6 @@ static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
 * which the task was queued or to one of that rcu_node structure's ancestors,
 * recursively up the tree. (Calm down, calm down, we do the recursion
 * iteratively!)
 *
 * Caller must hold the specified rcu_node structure's ->lock.
 */
static void __rcu_report_exp_rnp(struct rcu_node *rnp,
				 bool wake, unsigned long flags)
@@ -190,8 +185,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
{
	unsigned long mask;

	raw_lockdep_assert_held_rcu_node(rnp);
	for (;;) {
		if (!sync_rcu_preempt_exp_done(rnp)) {
		if (!sync_rcu_exp_done(rnp)) {
			if (!rnp->expmask)
				rcu_initiate_boost(rnp, flags);
			else
@@ -211,7 +207,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
		rnp = rnp->parent;
		raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
		WARN_ON_ONCE(!(rnp->expmask & mask));
		rnp->expmask &= ~mask;
		WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
	}
}

@@ -234,14 +230,23 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
				    unsigned long mask, bool wake)
{
	int cpu;
	unsigned long flags;
	struct rcu_data *rdp;

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	if (!(rnp->expmask & mask)) {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		return;
	}
	rnp->expmask &= ~mask;
	WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
	for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
		rdp = per_cpu_ptr(&rcu_data, cpu);
		if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp)
			continue;
		rdp->rcu_forced_tick_exp = false;
		tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
	}
	__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
}

@@ -345,8 +350,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
	/* Each pass checks a CPU for identity, offline, and idle. */
	mask_ofl_test = 0;
	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
		unsigned long mask = rdp->grpmask;
		int snap;

		if (raw_smp_processor_id() == cpu ||
@@ -372,12 +377,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	/* IPI the remaining CPUs for expedited quiescent state. */
	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
	for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
		unsigned long mask = rdp->grpmask;

		if (!(mask_ofl_ipi & mask))
			continue;
retry_ipi:
		if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
			mask_ofl_test |= mask;
@@ -389,10 +392,10 @@ retry_ipi:
		}
		ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
		put_cpu();
		if (!ret) {
			mask_ofl_ipi &= ~mask;
		/* The CPU will report the QS in response to the IPI. */
		if (!ret)
			continue;
		}

		/* Failed, raced with CPU hotplug operation. */
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		if ((rnp->qsmaskinitnext & mask) &&
@@ -403,13 +406,12 @@ retry_ipi:
			schedule_timeout_uninterruptible(1);
			goto retry_ipi;
		}
		/* CPU really is offline, so we can ignore it. */
		if (!(rnp->expmask & mask))
			mask_ofl_ipi &= ~mask;
		/* CPU really is offline, so we must report its QS. */
		if (rnp->expmask & mask)
			mask_ofl_test |= mask;
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	}
	/* Report quiescent states for those that went offline. */
	mask_ofl_test |= mask_ofl_ipi;
	if (mask_ofl_test)
		rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
}
@@ -456,29 +458,62 @@ static void sync_rcu_exp_select_cpus(void)
		flush_work(&rnp->rew.rew_work);
}

static void synchronize_sched_expedited_wait(void)
/*
 * Wait for the expedited grace period to elapse, within time limit.
 * If the time limit is exceeded without the grace period elapsing,
 * return false, otherwise return true.
 */
static bool synchronize_rcu_expedited_wait_once(long tlimit)
{
	int t;
	struct rcu_node *rnp_root = rcu_get_root();

	t = swait_event_timeout_exclusive(rcu_state.expedited_wq,
					  sync_rcu_exp_done_unlocked(rnp_root),
					  tlimit);
	// Workqueues should not be signaled.
	if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root))
		return true;
	WARN_ON(t < 0);  /* workqueues should not be signaled. */
	return false;
}
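The refactoring above funnels every bounded wait through synchronize_rcu_expedited_wait_once(), which re-tests the grace-period condition itself rather than trusting the timeout return value alone. A userspace analogue of that shape using pthread_cond_timedwait(); illustrative only:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool gp_done;

static bool wait_once(long timeout_ms) /* like synchronize_rcu_expedited_wait_once() */
{
	struct timespec ts;
	bool done;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += timeout_ms / 1000;
	ts.tv_nsec += (timeout_ms % 1000) * 1000000L;
	if (ts.tv_nsec >= 1000000000L) {
		ts.tv_sec++;
		ts.tv_nsec -= 1000000000L;
	}
	pthread_mutex_lock(&lock);
	while (!gp_done && pthread_cond_timedwait(&cond, &lock, &ts) == 0)
		;               /* woken: loop re-tests the condition */
	done = gp_done;         /* re-test the condition itself */
	pthread_mutex_unlock(&lock);
	return done;
}

int main(void)
{
	printf("first try:  %d\n", wait_once(10)); /* times out: 0 */
	gp_done = true;
	printf("second try: %d\n", wait_once(10)); /* condition set: 1 */
	return 0;
}

Callers that need stall warnings can then loop on this helper, exactly as synchronize_rcu_expedited_wait() does below.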
/*
 * Wait for the expedited grace period to elapse, issuing any needed
 * RCU CPU stall warnings along the way.
 */
static void synchronize_rcu_expedited_wait(void)
{
	int cpu;
	unsigned long jiffies_stall;
	unsigned long jiffies_start;
	unsigned long mask;
	int ndetected;
	struct rcu_data *rdp;
	struct rcu_node *rnp;
	struct rcu_node *rnp_root = rcu_get_root();
	int ret;

	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
	jiffies_stall = rcu_jiffies_till_stall_check();
	jiffies_start = jiffies;
	if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
		if (synchronize_rcu_expedited_wait_once(1))
			return;
		rcu_for_each_leaf_node(rnp) {
			for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
				rdp = per_cpu_ptr(&rcu_data, cpu);
				if (rdp->rcu_forced_tick_exp)
					continue;
				rdp->rcu_forced_tick_exp = true;
				tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
			}
		}
		WARN_ON_ONCE(1);
	}

	for (;;) {
		ret = swait_event_timeout_exclusive(
				rcu_state.expedited_wq,
				sync_rcu_preempt_exp_done_unlocked(rnp_root),
				jiffies_stall);
		if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
		if (synchronize_rcu_expedited_wait_once(jiffies_stall))
			return;
		WARN_ON(ret < 0);  /* workqueues should not be signaled. */
		if (rcu_cpu_stall_suppress)
			continue;
		panic_on_rcu_stall();
@@ -491,7 +526,7 @@ static void synchronize_sched_expedited_wait(void)
				struct rcu_data *rdp;

				mask = leaf_node_cpu_bit(rnp, cpu);
				if (!(rnp->expmask & mask))
				if (!(READ_ONCE(rnp->expmask) & mask))
					continue;
				ndetected++;
				rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -503,17 +538,18 @@ static void synchronize_sched_expedited_wait(void)
		}
		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
			jiffies - jiffies_start, rcu_state.expedited_sequence,
			rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
			READ_ONCE(rnp_root->expmask),
			".T"[!!rnp_root->exp_tasks]);
		if (ndetected) {
			pr_err("blocking rcu_node structures:");
			rcu_for_each_node_breadth_first(rnp) {
				if (rnp == rnp_root)
					continue; /* printed unconditionally */
				if (sync_rcu_preempt_exp_done_unlocked(rnp))
				if (sync_rcu_exp_done_unlocked(rnp))
					continue;
				pr_cont(" l=%u:%d-%d:%#lx/%c",
					rnp->level, rnp->grplo, rnp->grphi,
					rnp->expmask,
					READ_ONCE(rnp->expmask),
					".T"[!!rnp->exp_tasks]);
			}
			pr_cont("\n");
@@ -521,7 +557,7 @@ static void synchronize_sched_expedited_wait(void)
		rcu_for_each_leaf_node(rnp) {
			for_each_leaf_node_possible_cpu(rnp, cpu) {
				mask = leaf_node_cpu_bit(rnp, cpu);
				if (!(rnp->expmask & mask))
				if (!(READ_ONCE(rnp->expmask) & mask))
					continue;
				dump_cpu_task(cpu);
			}
@@ -540,16 +576,15 @@ static void rcu_exp_wait_wake(unsigned long s)
{
	struct rcu_node *rnp;

	synchronize_sched_expedited_wait();
	synchronize_rcu_expedited_wait();

	// Switch over to wakeup mode, allowing the next GP to proceed.
	// End the previous grace period only after acquiring the mutex
	// to ensure that only one GP runs concurrently with wakeups.
	mutex_lock(&rcu_state.exp_wake_mutex);
	rcu_exp_gp_seq_end();
	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));

	/*
	 * Switch over to wakeup mode, allowing the next GP, but -only- the
	 * next GP, to proceed.
	 */
	mutex_lock(&rcu_state.exp_wake_mutex);

	rcu_for_each_node_breadth_first(rnp) {
		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
			spin_lock(&rnp->exp_lock);
@@ -559,7 +594,7 @@ static void rcu_exp_wait_wake(unsigned long s)
			spin_unlock(&rnp->exp_lock);
		}
		smp_mb(); /* All above changes before wakeup. */
		wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);
		wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);
	}
	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
	mutex_unlock(&rcu_state.exp_wake_mutex);
@@ -610,7 +645,7 @@ static void rcu_exp_handler(void *unused)
	 * critical section. If also enabled or idle, immediately
	 * report the quiescent state, otherwise defer.
	 */
	if (!t->rcu_read_lock_nesting) {
	if (!rcu_preempt_depth()) {
		if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
		    rcu_dynticks_curr_cpu_in_eqs()) {
			rcu_report_exp_rdp(rdp);
@@ -634,7 +669,7 @@ static void rcu_exp_handler(void *unused)
	 * can have caused this quiescent state to already have been
	 * reported, so we really do need to check ->expmask.
	 */
	if (t->rcu_read_lock_nesting > 0) {
	if (rcu_preempt_depth() > 0) {
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		if (rnp->expmask & rdp->grpmask) {
			rdp->exp_deferred_qs = true;
@@ -670,7 +705,7 @@ static void rcu_exp_handler(void *unused)
	}
}

/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */
/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */
static void sync_sched_exp_online_cleanup(int cpu)
{
}
@@ -785,7 +820,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
 * implementations, it is still unfriendly to real-time workloads, so is
 * thus not recommended for any sort of common-case code. In fact, if
 * you are using synchronize_rcu_expedited() in a loop, please restructure
 * your code to batch your updates, and then Use a single synchronize_rcu()
 * your code to batch your updates, and then use a single synchronize_rcu()
 * instead.
 *
 * This has the same semantics as (but is more brutal than) synchronize_rcu().

@@ -220,7 +220,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
	 * blocked tasks.
	 */
	if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
		rnp->gp_tasks = &t->rcu_node_entry;
		WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
	}
	if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
@@ -290,8 +290,8 @@ void rcu_note_context_switch(bool preempt)

	trace_rcu_utilization(TPS("Start context switch"));
	lockdep_assert_irqs_disabled();
	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
	if (t->rcu_read_lock_nesting > 0 &&
	WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
	if (rcu_preempt_depth() > 0 &&
	    !t->rcu_read_unlock_special.b.blocked) {

		/* Possibly blocking in an RCU read-side critical section. */
@@ -340,7 +340,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 */
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
	return rnp->gp_tasks != NULL;
	return READ_ONCE(rnp->gp_tasks) != NULL;
}

/* Bias and limit values for ->rcu_read_lock_nesting. */
@@ -348,6 +348,21 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
#define RCU_NEST_NMAX (-INT_MAX / 2)
#define RCU_NEST_PMAX (INT_MAX / 2)

static void rcu_preempt_read_enter(void)
{
	current->rcu_read_lock_nesting++;
}

static void rcu_preempt_read_exit(void)
{
	current->rcu_read_lock_nesting--;
}

static void rcu_preempt_depth_set(int val)
{
	current->rcu_read_lock_nesting = val;
}
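The three helpers above are the heart of this conversion: every open-coded access to ->rcu_read_lock_nesting now goes through enter/exit/set accessors (plus the rcu_preempt_depth() reader), so the counter's representation can later change in one place. A userspace sketch of the same accessor discipline with a thread-local counter; illustrative only:

#include <stdio.h>

static _Thread_local int read_lock_nesting;

static void read_enter(void) { read_lock_nesting++; }   /* like rcu_preempt_read_enter() */
static void read_exit(void)  { read_lock_nesting--; }   /* like rcu_preempt_read_exit() */
static void depth_set(int v) { read_lock_nesting = v; } /* like rcu_preempt_depth_set() */
static int  depth(void)      { return read_lock_nesting; }

int main(void)
{
	read_enter();
	read_enter();
	printf("depth after two enters: %d\n", depth()); /* 2 */
	read_exit();
	depth_set(0);
	printf("depth after exit+set:   %d\n", depth()); /* 0 */
	return 0;
}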
/*
|
||||
* Preemptible RCU implementation for rcu_read_lock().
|
||||
* Just increment ->rcu_read_lock_nesting, shared state will be updated
|
||||
@@ -355,9 +370,9 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
|
||||
*/
|
||||
void __rcu_read_lock(void)
|
||||
{
|
||||
current->rcu_read_lock_nesting++;
|
||||
rcu_preempt_read_enter();
|
||||
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
|
||||
WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX);
|
||||
WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
|
||||
barrier(); /* critical section after entry code. */
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__rcu_read_lock);
|
||||
@@ -373,19 +388,19 @@ void __rcu_read_unlock(void)
|
||||
{
|
||||
struct task_struct *t = current;
|
||||
|
||||
if (t->rcu_read_lock_nesting != 1) {
|
||||
--t->rcu_read_lock_nesting;
|
||||
if (rcu_preempt_depth() != 1) {
|
||||
rcu_preempt_read_exit();
|
||||
} else {
|
||||
barrier(); /* critical section before exit code. */
|
||||
t->rcu_read_lock_nesting = -RCU_NEST_BIAS;
|
||||
rcu_preempt_depth_set(-RCU_NEST_BIAS);
|
||||
barrier(); /* assign before ->rcu_read_unlock_special load */
|
||||
if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
|
||||
rcu_read_unlock_special(t);
|
||||
barrier(); /* ->rcu_read_unlock_special load before assign */
|
||||
t->rcu_read_lock_nesting = 0;
|
||||
rcu_preempt_depth_set(0);
|
||||
}
|
||||
if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
|
||||
int rrln = t->rcu_read_lock_nesting;
|
||||
int rrln = rcu_preempt_depth();
|
||||
|
||||
WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX);
|
||||
}
|
||||
@@ -444,15 +459,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
|
||||
local_irq_restore(flags);
|
||||
return;
|
||||
}
|
||||
t->rcu_read_unlock_special.b.deferred_qs = false;
|
||||
if (special.b.need_qs) {
|
||||
t->rcu_read_unlock_special.s = 0;
|
||||
if (special.b.need_qs)
|
||||
rcu_qs();
|
||||
t->rcu_read_unlock_special.b.need_qs = false;
|
||||
if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) {
|
||||
local_irq_restore(flags);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Respond to a request by an expedited grace period for a
|
||||
@@ -460,17 +469,11 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
|
||||
* tasks are handled when removing the task from the
|
||||
* blocked-tasks list below.
|
||||
*/
|
||||
if (rdp->exp_deferred_qs) {
|
||||
if (rdp->exp_deferred_qs)
|
||||
rcu_report_exp_rdp(rdp);
|
||||
if (!t->rcu_read_unlock_special.s) {
|
||||
local_irq_restore(flags);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Clean up if blocked during RCU read-side critical section. */
|
||||
if (special.b.blocked) {
|
||||
t->rcu_read_unlock_special.b.blocked = false;
|
||||
|
||||
/*
|
||||
* Remove this task from the list it blocked on. The task
|
||||
@@ -485,7 +488,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
|
||||
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
|
||||
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
|
||||
(!empty_norm || rnp->qsmask));
|
||||
empty_exp = sync_rcu_preempt_exp_done(rnp);
|
||||
empty_exp = sync_rcu_exp_done(rnp);
|
||||
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
|
||||
np = rcu_next_node_entry(t, rnp);
|
||||
list_del_init(&t->rcu_node_entry);
|
||||
@@ -493,7 +496,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
|
||||
trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
|
||||
rnp->gp_seq, t->pid);
|
||||
if (&t->rcu_node_entry == rnp->gp_tasks)
|
||||
rnp->gp_tasks = np;
|
||||
WRITE_ONCE(rnp->gp_tasks, np);
|
||||
if (&t->rcu_node_entry == rnp->exp_tasks)
|
||||
rnp->exp_tasks = np;
|
||||
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
|
||||
@@ -509,7 +512,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
|
||||
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
|
||||
* so we must take a snapshot of the expedited state.
|
||||
*/
|
||||
empty_exp_now = sync_rcu_preempt_exp_done(rnp);
|
||||
empty_exp_now = sync_rcu_exp_done(rnp);
|
||||
if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
|
||||
trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
|
||||
rnp->gp_seq,
|
||||
@@ -551,7 +554,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
|
||||
{
|
||||
return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
|
||||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
|
||||
t->rcu_read_lock_nesting <= 0;
|
||||
rcu_preempt_depth() <= 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -564,16 +567,16 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
|
||||
static void rcu_preempt_deferred_qs(struct task_struct *t)
|
||||
{
|
||||
unsigned long flags;
|
||||
bool couldrecurse = t->rcu_read_lock_nesting >= 0;
|
||||
bool couldrecurse = rcu_preempt_depth() >= 0;
|
||||
|
||||
if (!rcu_preempt_need_deferred_qs(t))
|
||||
return;
|
||||
if (couldrecurse)
|
||||
t->rcu_read_lock_nesting -= RCU_NEST_BIAS;
|
||||
rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS);
|
||||
local_irq_save(flags);
|
||||
rcu_preempt_deferred_qs_irqrestore(t, flags);
|
||||
if (couldrecurse)
|
||||
t->rcu_read_lock_nesting += RCU_NEST_BIAS;
|
||||
rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -610,9 +613,8 @@ static void rcu_read_unlock_special(struct task_struct *t)
|
||||
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
|
||||
struct rcu_node *rnp = rdp->mynode;
|
||||
|
||||
t->rcu_read_unlock_special.b.exp_hint = false;
|
||||
exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) ||
|
||||
(rdp->grpmask & rnp->expmask) ||
|
||||
(rdp->grpmask & READ_ONCE(rnp->expmask)) ||
|
||||
tick_nohz_full_cpu(rdp->cpu);
|
||||
// Need to defer quiescent state until everything is enabled.
|
||||
if (irqs_were_disabled && use_softirq &&
|
||||
@@ -640,7 +642,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
|
||||
local_irq_restore(flags);
|
||||
return;
|
||||
}
|
||||
WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
|
||||
rcu_preempt_deferred_qs_irqrestore(t, flags);
|
||||
}
|
||||
|
||||
@@ -648,8 +649,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
|
||||
* Check that the list of blocked tasks for the newly completed grace
|
||||
* period is in fact empty. It is a serious bug to complete a grace
|
||||
* period that still has RCU readers blocked! This function must be
|
||||
* invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock
|
||||
* must be held by the caller.
|
||||
* invoked -before- updating this rnp's ->gp_seq.
|
||||
*
|
||||
* Also, if there are blocked tasks on the list, they automatically
|
||||
* block the newly created grace period, so set up ->gp_tasks accordingly.
|
||||
@@ -659,11 +659,12 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
|
||||
struct task_struct *t;
|
||||
|
||||
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
|
||||
raw_lockdep_assert_held_rcu_node(rnp);
|
||||
if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
|
||||
dump_blkd_tasks(rnp, 10);
|
||||
if (rcu_preempt_has_tasks(rnp) &&
|
||||
(rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
|
||||
rnp->gp_tasks = rnp->blkd_tasks.next;
|
||||
WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
|
||||
t = container_of(rnp->gp_tasks, struct task_struct,
|
||||
rcu_node_entry);
|
||||
trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
|
||||
@@ -686,7 +687,7 @@ static void rcu_flavor_sched_clock_irq(int user)
|
||||
if (user || rcu_is_cpu_rrupt_from_idle()) {
|
||||
rcu_note_voluntary_context_switch(current);
|
||||
}
|
||||
if (t->rcu_read_lock_nesting > 0 ||
|
||||
if (rcu_preempt_depth() > 0 ||
|
||||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
|
||||
/* No QS, force context switch if deferred. */
|
||||
if (rcu_preempt_need_deferred_qs(t)) {
|
||||
@@ -696,13 +697,13 @@ static void rcu_flavor_sched_clock_irq(int user)
|
||||
} else if (rcu_preempt_need_deferred_qs(t)) {
|
||||
rcu_preempt_deferred_qs(t); /* Report deferred QS. */
|
||||
return;
|
||||
} else if (!t->rcu_read_lock_nesting) {
|
||||
} else if (!rcu_preempt_depth()) {
|
||||
rcu_qs(); /* Report immediate QS. */
|
||||
return;
|
||||
}
|
||||
|
||||
/* If GP is oldish, ask for help from rcu_read_unlock_special(). */
|
||||
if (t->rcu_read_lock_nesting > 0 &&
|
||||
if (rcu_preempt_depth() > 0 &&
|
||||
__this_cpu_read(rcu_data.core_needs_qs) &&
|
||||
__this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
|
||||
!t->rcu_read_unlock_special.b.need_qs &&
|
||||
@@ -723,11 +724,11 @@ void exit_rcu(void)
|
||||
struct task_struct *t = current;
|
||||
|
||||
if (unlikely(!list_empty(¤t->rcu_node_entry))) {
t->rcu_read_lock_nesting = 1;
rcu_preempt_depth_set(1);
barrier();
WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
} else if (unlikely(t->rcu_read_lock_nesting)) {
t->rcu_read_lock_nesting = 1;
} else if (unlikely(rcu_preempt_depth())) {
rcu_preempt_depth_set(1);
} else {
return;
}
@@ -757,7 +758,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
__func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks);
__func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks,
rnp->exp_tasks);
pr_info("%s: ->blkd_tasks", __func__);
i = 0;
list_for_each(lhp, &rnp->blkd_tasks) {
@@ -788,7 +790,7 @@ static void __init rcu_bootup_announce(void)
}

/*
* Note a quiescent state for PREEMPT=n. Because we do not need to know
* Note a quiescent state for PREEMPTION=n. Because we do not need to know
* how many quiescent states passed, just if there was at least one since
* the start of the grace period, this just sets a flag. The caller must
* have disabled preemption.
@@ -838,7 +840,7 @@ void rcu_all_qs(void)
EXPORT_SYMBOL_GPL(rcu_all_qs);

/*
* Note a PREEMPT=n context switch. The caller must have disabled interrupts.
* Note a PREEMPTION=n context switch. The caller must have disabled interrupts.
*/
void rcu_note_context_switch(bool preempt)
{
@@ -1262,10 +1264,9 @@ static void rcu_prepare_for_idle(void)
/*
* This code is invoked when a CPU goes idle, at which point we want
* to have the CPU do everything required for RCU so that it can enter
* the energy-efficient dyntick-idle mode. This is handled by a
* state machine implemented by rcu_prepare_for_idle() below.
* the energy-efficient dyntick-idle mode.
*
* The following three proprocessor symbols control this state machine:
* The following preprocessor symbol controls this:
*
* RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
* to sleep in dyntick-idle mode with RCU callbacks pending. This
@@ -1274,21 +1275,15 @@ static void rcu_prepare_for_idle(void)
* number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
* system. And if you are -that- concerned about energy efficiency,
* just power the system down and be done with it!
* RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
* permitted to sleep in dyntick-idle mode with only lazy RCU
* callbacks pending. Setting this too high can OOM your system.
*
* The values below work well in practice. If future workloads require
* The value below works well in practice. If future workloads require
* adjustment, they can be converted into kernel config parameters, though
* making the state machine smarter might be a better option.
*/
#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */

static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);

/*
* Try to advance callbacks on the current CPU, but only if it has been
@@ -1327,8 +1322,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
/*
* Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
* to invoke. If the CPU has callbacks, try to advance them. Tell the
* caller to set the timeout based on whether or not there are non-lazy
* callbacks.
* caller about what to set the timeout.
*
* The caller must have disabled interrupts.
*/
@@ -1354,25 +1348,18 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
}
rdp->last_accelerate = jiffies;

/* Request timer delay depending on laziness, and round. */
rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist);
if (rdp->all_lazy) {
dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
} else {
dj = round_up(rcu_idle_gp_delay + jiffies,
rcu_idle_gp_delay) - jiffies;
}
/* Request timer and round. */
dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;

*nextevt = basemono + dj * TICK_NSEC;
return 0;
}

/*
* Prepare a CPU for idle from an RCU perspective. The first major task
* is to sense whether nohz mode has been enabled or disabled via sysfs.
* The second major task is to check to see if a non-lazy callback has
* arrived at a CPU that previously had only lazy callbacks. The third
* major task is to accelerate (that is, assign grace-period numbers to)
* any recently arrived callbacks.
* Prepare a CPU for idle from an RCU perspective. The first major task is to
* sense whether nohz mode has been enabled or disabled via sysfs. The second
* major task is to accelerate (that is, assign grace-period numbers to) any
* recently arrived callbacks.
*
* The caller must have disabled interrupts.
*/
@@ -1398,17 +1385,6 @@ static void rcu_prepare_for_idle(void)
if (!tne)
return;

/*
* If a non-lazy callback arrived at a CPU having only lazy
* callbacks, invoke RCU core for the side-effect of recalculating
* idle duration on re-entry to idle.
*/
if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) {
rdp->all_lazy = false;
invoke_rcu_core();
return;
}

/*
* If we have not yet accelerated this jiffy, accelerate all
* callbacks on this CPU.
@@ -2321,6 +2297,8 @@ static void __init rcu_organize_nocb_kthreads(void)
{
int cpu;
bool firsttime = true;
bool gotnocbs = false;
bool gotnocbscbs = true;
int ls = rcu_nocb_gp_stride;
int nl = 0; /* Next GP kthread. */
struct rcu_data *rdp;
@@ -2343,21 +2321,31 @@ static void __init rcu_organize_nocb_kthreads(void)
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->cpu >= nl) {
/* New GP kthread, set up for CBs & next GP. */
gotnocbs = true;
nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
rdp->nocb_gp_rdp = rdp;
rdp_gp = rdp;
if (!firsttime && dump_tree)
pr_cont("\n");
firsttime = false;
pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu);
if (dump_tree) {
if (!firsttime)
pr_cont("%s\n", gotnocbscbs
? "" : " (self only)");
gotnocbscbs = false;
firsttime = false;
pr_alert("%s: No-CB GP kthread CPU %d:",
__func__, cpu);
}
} else {
/* Another CB kthread, link to previous GP kthread. */
gotnocbscbs = true;
rdp->nocb_gp_rdp = rdp_gp;
rdp_prev->nocb_next_cb_rdp = rdp;
pr_alert(" %d", cpu);
if (dump_tree)
pr_cont(" %d", cpu);
}
rdp_prev = rdp;
}
if (gotnocbs && dump_tree)
pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
}

/*

@@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp)
//
// Printing RCU CPU stall warnings

#ifdef CONFIG_PREEMPTION
#ifdef CONFIG_PREEMPT_RCU

/*
* Dump detailed information for all tasks blocking the current RCU
@@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
return ndetected;
}

#else /* #ifdef CONFIG_PREEMPTION */
#else /* #ifdef CONFIG_PREEMPT_RCU */

/*
* Because preemptible RCU does not exist, we never have to check for
@@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
{
return 0;
}
#endif /* #else #ifdef CONFIG_PREEMPTION */
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

/*
* Dump stacks of all tasks running on stalled CPUs. First try using
@@ -263,11 +263,9 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);

sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
rdp->last_accelerate & 0xffff, jiffies & 0xffff,
".l"[rdp->all_lazy],
".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
".D"[!!rdp->tick_nohz_enabled_snap]);
!!rdp->tick_nohz_enabled_snap);
}

#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -279,6 +277,28 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)

#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */

static const char * const gp_state_names[] = {
[RCU_GP_IDLE] = "RCU_GP_IDLE",
[RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
[RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS",
[RCU_GP_ONOFF] = "RCU_GP_ONOFF",
[RCU_GP_INIT] = "RCU_GP_INIT",
[RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS",
[RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS",
[RCU_GP_CLEANUP] = "RCU_GP_CLEANUP",
[RCU_GP_CLEANED] = "RCU_GP_CLEANED",
};

/*
* Convert a ->gp_state value to a character string.
*/
static const char *gp_state_getname(short gs)
{
if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
return "???";
return gp_state_names[gs];
}

/*
* Print out diagnostic information for the specified stalled CPU.
*

@@ -40,6 +40,7 @@
#include <linux/rcupdate_wait.h>
#include <linux/sched/isolation.h>
#include <linux/kprobes.h>
#include <linux/slab.h>

#define CREATE_TRACE_POINTS

@@ -51,9 +52,7 @@
#define MODULE_PARAM_PREFIX "rcupdate."

#ifndef CONFIG_TINY_RCU
extern int rcu_expedited; /* from sysctl */
module_param(rcu_expedited, int, 0);
extern int rcu_normal; /* from sysctl */
module_param(rcu_normal, int, 0);
static int rcu_normal_after_boot;
module_param(rcu_normal_after_boot, int, 0);
@@ -218,6 +217,7 @@ static int __init rcu_set_runtime_mode(void)
{
rcu_test_sync_prims();
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
kfree_rcu_scheduler_running();
rcu_test_sync_prims();
return 0;
}
@@ -435,7 +435,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */

#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old, unsigned long c)
@@ -853,14 +853,22 @@ static void test_callback(struct rcu_head *r)

DEFINE_STATIC_SRCU(early_srcu);

struct early_boot_kfree_rcu {
struct rcu_head rh;
};

static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
static struct rcu_head shead;
struct early_boot_kfree_rcu *rhp;

call_rcu(&head, test_callback);
if (IS_ENABLED(CONFIG_SRCU))
call_srcu(&early_srcu, &shead, test_callback);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (!WARN_ON_ONCE(!rhp))
kfree_rcu(rhp, rh);
}

void rcu_early_boot_tests(void)

@@ -370,7 +370,7 @@ u64 sched_clock_cpu(int cpu)
if (sched_clock_stable())
return sched_clock() + __sched_clock_offset;

if (!static_branch_unlikely(&sched_clock_running))
if (!static_branch_likely(&sched_clock_running))
return sched_clock();

preempt_disable_notrace();
@@ -393,7 +393,7 @@ void sched_clock_tick(void)
if (sched_clock_stable())
return;

if (!static_branch_unlikely(&sched_clock_running))
if (!static_branch_likely(&sched_clock_running))
return;

lockdep_assert_irqs_disabled();
@@ -460,7 +460,7 @@ void __init sched_clock_init(void)

u64 sched_clock_cpu(int cpu)
{
if (!static_branch_unlikely(&sched_clock_running))
if (!static_branch_likely(&sched_clock_running))
return 0;

return sched_clock();

@@ -919,17 +919,17 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
return uc_req;
}

unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;

/* Task currently refcounted: use back-annotated (effective) value */
if (p->uclamp[clamp_id].active)
return p->uclamp[clamp_id].value;
return (unsigned long)p->uclamp[clamp_id].value;

uc_eff = uclamp_eff_get(p, clamp_id);

return uc_eff.value;
return (unsigned long)uc_eff.value;
}

/*
@@ -1253,7 +1253,8 @@ static void __init init_uclamp(void)
mutex_init(&uclamp_mutex);

for_each_possible_cpu(cpu) {
memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
memset(&cpu_rq(cpu)->uclamp, 0,
sizeof(struct uclamp_rq)*UCLAMP_CNT);
cpu_rq(cpu)->uclamp_flags = 0;
}

@@ -4504,7 +4505,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
int old_prio, delta;
int old_prio;
struct rq_flags rf;
struct rq *rq;

@@ -4538,19 +4539,18 @@ void set_user_nice(struct task_struct *p, long nice)
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
delta = p->prio - old_prio;

if (queued) {
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
*/
if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_curr(rq);
}
if (running)
set_next_task(rq, p);

/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
*/
p->sched_class->prio_changed(rq, p, old_prio);

out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -7100,6 +7100,12 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)

if (parent)
sched_online_group(tg, parent);

#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Propagate the effective uclamp value for the new group */
cpu_util_update_eff(css);
#endif

return 0;
}

@@ -238,7 +238,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
*/
util = util_cfs + cpu_util_rt(rq);
if (type == FREQUENCY_UTIL)
util = uclamp_util_with(rq, util, p);
util = uclamp_rq_util_with(rq, util, p);

dl_util = cpu_util_dl(rq);

@@ -46,6 +46,8 @@ static int convert_prio(int prio)
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
* @fitness_fn: A pointer to a function to do custom checks whether the CPU
* fits a specific criteria so that we only return those CPUs.
*
* Note: This function returns the recommended CPUs as calculated during the
* current invocation. By the time the call returns, the CPUs may have in
@@ -57,7 +59,8 @@ static int convert_prio(int prio)
* Return: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask)
struct cpumask *lowest_mask,
bool (*fitness_fn)(struct task_struct *p, int cpu))
{
int idx = 0;
int task_pri = convert_prio(p->prio);
@@ -98,6 +101,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
continue;

if (lowest_mask) {
int cpu;

cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);

/*
@@ -108,7 +113,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
* condition, simply act as though we never hit this
* priority level and continue on.
*/
if (cpumask_any(lowest_mask) >= nr_cpu_ids)
if (cpumask_empty(lowest_mask))
continue;

if (!fitness_fn)
return 1;

/* Ensure the capacity of the CPUs fit the task */
for_each_cpu(cpu, lowest_mask) {
if (!fitness_fn(p, cpu))
cpumask_clear_cpu(cpu, lowest_mask);
}

/*
* If no CPU at the current priority can fit the task
* continue looking
*/
if (cpumask_empty(lowest_mask))
continue;
}
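
The new fitness_fn hook above amounts to a filter pass over the candidate mask followed by an emptiness check. A minimal userspace sketch of that loop, using a plain uint64_t in place of struct cpumask — the fits() helper and the 446/1024 capacity table are illustrative big.LITTLE-style stand-ins, not from this patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for the kernel's cpumask and per-CPU capacity helpers. */
static const unsigned int cpu_capacity[4] = { 446, 446, 1024, 1024 };

static bool fits(unsigned int required, int cpu)
{
	return cpu_capacity[cpu] >= required;
}

/* Mirror of the filter pass cpupri_find() now runs when fitness_fn is set:
 * clear every candidate CPU the callback rejects, then check for empty. */
static bool filter_mask(uint64_t *mask, unsigned int required)
{
	for (int cpu = 0; cpu < 4; cpu++) {
		if ((*mask & (1ULL << cpu)) && !fits(required, cpu))
			*mask &= ~(1ULL << cpu);
	}
	return *mask != 0;	/* empty mask => keep looking at lower prio */
}

int main(void)
{
	uint64_t lowest_mask = 0xf;	/* CPUs 0-3 eligible by priority */

	if (filter_mask(&lowest_mask, 600))
		printf("fitting CPUs: %#llx\n", (unsigned long long)lowest_mask);
	return 0;
}

With these numbers only the two big CPUs survive (mask 0xc), matching the "continue looking if empty" behaviour of the kernel loop.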

@@ -18,7 +18,9 @@ struct cpupri {
};

#ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask,
bool (*fitness_fn)(struct task_struct *p, int cpu));
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);

@@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int ticks)
int ticks)
{
u64 other, cputime = TICK_NSEC * ticks;

@@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime);
} else if (p == rq->idle) {
} else if (p == this_rq()->idle) {
account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime);
@@ -392,14 +392,12 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,

static void irqtime_account_idle_ticks(int ticks)
{
struct rq *rq = this_rq();

irqtime_account_process_tick(current, 0, rq, ticks);
irqtime_account_process_tick(current, 0, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int nr_ticks) { }
int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
@@ -473,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
void account_process_tick(struct task_struct *p, int user_tick)
{
u64 cputime, steal;
struct rq *rq = this_rq();

if (vtime_accounting_enabled_this_cpu())
return;

if (sched_clock_irqtime) {
irqtime_account_process_tick(p, user_tick, rq, 1);
irqtime_account_process_tick(p, user_tick, 1);
return;
}

@@ -493,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick)

if (user_tick)
account_user_time(p, cputime);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, cputime);
else
account_idle_time(cputime);

@@ -751,9 +751,16 @@ void sysrq_sched_debug_show(void)
int cpu;

sched_debug_header(NULL);
for_each_online_cpu(cpu)
for_each_online_cpu(cpu) {
/*
* Need to reset softlockup watchdogs on all CPUs, because
* another CPU might be blocked waiting for us to process
* an IPI or stop_machine.
*/
touch_nmi_watchdog();
touch_all_softlockup_watchdogs();
print_cpu(NULL, cpu);

}
}

/*

@@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p)
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se, 0);
attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
@@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);

if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf

runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
delta_avg = runnable_load_avg - se->avg.runnable_load_avg;

se->avg.runnable_load_sum = runnable_sum;
se->avg.runnable_load_avg = runnable_load_avg;

if (se->on_rq) {
delta_sum = runnable_load_sum -
se_weight(se) * se->avg.runnable_load_sum;
delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
}

se->avg.runnable_load_sum = runnable_sum;
se->avg.runnable_load_avg = runnable_load_avg;
}

static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3520,7 +3521,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;

@@ -3556,7 +3557,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s

add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);

cfs_rq_util_change(cfs_rq, flags);
cfs_rq_util_change(cfs_rq, 0);

trace_pelt_cfs_tp(cfs_rq);
}
@@ -3614,7 +3615,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*
* IOW we're enqueueing a task on a new CPU.
*/
attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);

} else if (decayed) {
@@ -3711,6 +3712,20 @@ static inline unsigned long task_util_est(struct task_struct *p)
return max(task_util(p), _task_util_est(p));
}

#ifdef CONFIG_UCLAMP_TASK
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
return clamp(task_util_est(p),
uclamp_eff_value(p, UCLAMP_MIN),
uclamp_eff_value(p, UCLAMP_MAX));
}
#else
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
return task_util_est(p);
}
#endif

static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
@@ -3822,7 +3837,7 @@ done:

static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
return fits_capacity(task_util_est(p), capacity);
return fits_capacity(uclamp_task_util(p), capacity);
}

static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -3857,7 +3872,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static inline void remove_entity_load_avg(struct sched_entity *se) {}

static inline void
attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}

@@ -5196,6 +5211,20 @@ static inline void update_overutilized_status(struct rq *rq)
static inline void update_overutilized_status(struct rq *rq) { }
#endif

/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
{
return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
rq->nr_running);
}

#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
return sched_idle_rq(cpu_rq(cpu));
}
#endif

/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5310,6 +5339,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5356,6 +5386,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
sub_nr_running(rq, 1);

/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;

util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -5378,15 +5412,6 @@ static struct {

#endif /* CONFIG_NO_HZ_COMMON */

/* CPU only has SCHED_IDLE tasks enqueued */
static int sched_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);

return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
rq->nr_running);
}

static unsigned long cpu_load(struct rq *rq)
{
return cfs_rq_load_avg(&rq->cfs);
@@ -5588,7 +5613,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
int shallowest_idle_cpu = -1, si_cpu = -1;
int shallowest_idle_cpu = -1;
int i;

/* Check if we have any choice: */
@@ -5597,6 +5622,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this

/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
if (sched_idle_cpu(i))
return i;

if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5619,12 +5647,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
if (sched_idle_cpu(i)) {
si_cpu = i;
continue;
}

} else if (shallowest_idle_cpu == -1) {
load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
@@ -5633,11 +5656,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
}

if (shallowest_idle_cpu != -1)
return shallowest_idle_cpu;
if (si_cpu != -1)
return si_cpu;
return least_loaded_cpu;
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}

static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5790,7 +5809,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
*/
static int select_idle_smt(struct task_struct *p, int target)
{
int cpu, si_cpu = -1;
int cpu;

if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5798,13 +5817,11 @@ static int select_idle_smt(struct task_struct *p, int target)
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
if (si_cpu == -1 && sched_idle_cpu(cpu))
si_cpu = cpu;
}

return si_cpu;
return -1;
}

#else /* CONFIG_SCHED_SMT */
@@ -5828,12 +5845,13 @@ static inline int select_idle_smt(struct task_struct *p, int target)
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
int this = smp_processor_id();
int cpu, nr = INT_MAX, si_cpu = -1;
int cpu, nr = INT_MAX;

this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5859,15 +5877,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t

time = cpu_clock(this);

for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
return si_cpu;
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
return -1;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
break;
if (si_cpu == -1 && sched_idle_cpu(cpu))
si_cpu = cpu;
}

time = cpu_clock(this) - time;
@@ -6268,9 +6284,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;

/* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
spare_cap = cpu_cap - util;

/*
* Skip CPUs that cannot satisfy the capacity request.
* IOW, placing the task there would make the CPU
* overutilized. Take uclamp into account to see how
* much capacity we can get out of the CPU; this is
* aligned with schedutil_cpu_util().
*/
util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
if (!fits_capacity(util, cpu_cap))
continue;

@@ -6285,7 +6310,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* Find the CPU with the maximum spare capacity in
* the performance domain
*/
spare_cap = cpu_cap - util;
if (spare_cap > max_spare_cap) {
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
@@ -7780,29 +7804,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/

for_each_cpu(cpu, sched_group_span(sdg)) {
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
unsigned long cpu_cap = capacity_of(cpu);

/*
* build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
* runqueues.
*
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
* This avoids capacity from being 0 and
* causing divide-by-zero issues on boot.
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
} else {
sgc = rq->sd->groups->sgc;
capacity += sgc->capacity;
}

min_capacity = min(capacity, min_capacity);
max_capacity = max(capacity, max_capacity);
capacity += cpu_cap;
min_capacity = min(cpu_cap, min_capacity);
max_capacity = max(cpu_cap, max_capacity);
}
} else {
/*
@@ -8168,14 +8174,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,

case group_has_spare:
/*
* Select not overloaded group with lowest number of
* idle cpus. We could also compare the spare capacity
* which is more stable but it can end up that the
* group has less spare capacity but finally more idle
* Select not overloaded group with lowest number of idle cpus
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
* that the group has less spare capacity but finally more idle
* CPUs which means less opportunity to pull tasks.
*/
if (sgs->idle_cpus >= busiest->idle_cpus)
if (sgs->idle_cpus > busiest->idle_cpus)
return false;
else if ((sgs->idle_cpus == busiest->idle_cpus) &&
(sgs->sum_nr_running <= busiest->sum_nr_running))
return false;

break;
}

@@ -9529,6 +9539,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -9565,7 +9576,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}

interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
interval = get_sd_balance_interval(sd, busy);

need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
@@ -9581,9 +9592,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -10333,6 +10345,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;

if (rq->cfs.nr_running == 1)
return;

/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
@@ -10423,7 +10438,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)

/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se, 0);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
}
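
The uclamp_task_util()/task_fits_capacity() pairing above boxes the PELT estimate into the task's clamp window before the capacity check. A self-contained sketch of that arithmetic — the 1280/1024 margin mirrors the shape of the kernel's fits_capacity() definition, while the capacities and clamp values are made-up big.LITTLE-style numbers:

#include <stdio.h>

#define UCLAMP_MIN 0
#define UCLAMP_MAX 1

/* Same shape as the kernel macro: util must leave roughly 20% headroom. */
#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

static unsigned long clamp_val(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Sketch of uclamp_task_util(): the PELT estimate is boxed into the
 * task's effective [uclamp_min, uclamp_max] window. */
static unsigned long uclamp_task_util(unsigned long util_est,
				      const unsigned long uc[2])
{
	return clamp_val(util_est, uc[UCLAMP_MIN], uc[UCLAMP_MAX]);
}

int main(void)
{
	unsigned long uc[2] = { 512, 1024 };	/* boosted min, open max */
	unsigned long util = uclamp_task_util(100, uc);	/* tiny but boosted task */

	/* A 446-capacity little CPU no longer fits once the boost applies. */
	printf("clamped util %lu fits little CPU: %d\n",
	       util, fits_capacity(util, 446));
	return 0;
}

The clamp lifts the 100-unit estimate to 512, so the misfit logic now treats the boosted task as too big for a little CPU even though its measured utilization is small.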

@@ -158,7 +158,7 @@ static void cpuidle_idle_call(void)
/*
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in iterrupts (if any). In that case bypass
* activity happens here and in interrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle

@@ -129,8 +129,20 @@
* Step 2
*/
delta %= 1024;
contrib = __accumulate_pelt_segments(periods,
1024 - sa->period_contrib, delta);
if (load) {
/*
* This relies on the:
*
* if (!load)
* runnable = running = 0;
*
* clause from ___update_load_sum(); this results in
* the below usage of @contrib to dissapear entirely,
* so no point in calculating it.
*/
contrib = __accumulate_pelt_segments(periods,
1024 - sa->period_contrib, delta);
}
}
sa->period_contrib = delta;

@@ -205,7 +217,9 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
* this happens during idle_balance() which calls
* update_blocked_averages()
* update_blocked_averages().
*
* Also see the comment in accumulate_sum().
*/
if (!load)
runnable = running = 0;
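
For reference, the segment sum that accumulate_sum() now skips on no-load updates is the standard PELT geometric series. A floating-point sketch of what __accumulate_pelt_segments() computes — the kernel uses fixed-point lookup tables instead, and y here is the decay rate with y^32 = 0.5:

#include <math.h>
#include <stdio.h>

/* Floating-point sketch of __accumulate_pelt_segments(): the d1 remainder
 * of the old 1024 us period decays by y^p, the p-1 full periods contribute
 * the closed-form geometric sum, and the new partial period d3 counts in
 * full (p >= 1 full period boundaries crossed). */
static double pelt_segments(unsigned int p, double d1, double d3)
{
	double y = pow(0.5, 1.0 / 32.0);	/* half-life of 32 periods */
	double full = 1024.0 * (y / (1.0 - y)) * (1.0 - pow(y, p - 1));

	return d1 * pow(y, p) + full + d3;
}

int main(void)
{
	/* 4 period boundaries crossed; 300 us left in the old one, 200 us new. */
	printf("contrib = %.1f\n", pelt_segments(4, 300.0, 200.0));
	return 0;
}

When load is zero the result would be multiplied by zero downstream, which is exactly why the patch gates the computation behind if (load).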

@@ -1280,10 +1280,12 @@ static const struct file_operations psi_cpu_fops = {

static int __init psi_proc_init(void)
{
proc_mkdir("pressure", NULL);
proc_create("pressure/io", 0, NULL, &psi_io_fops);
proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
if (psi_enable) {
proc_mkdir("pressure", NULL);
proc_create("pressure/io", 0, NULL, &psi_io_fops);
proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
}
return 0;
}
module_init(psi_proc_init);

@@ -437,6 +437,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->on_rq;
}

#ifdef CONFIG_UCLAMP_TASK
/*
* Verify the fitness of task @p to run on @cpu taking into account the uclamp
* settings.
*
* This check is only important for heterogeneous systems where uclamp_min value
* is higher than the capacity of a @cpu. For non-heterogeneous system this
* function will always return true.
*
* The function will return true if the capacity of the @cpu is >= the
* uclamp_min and false otherwise.
*
* Note that uclamp_min will be clamped to uclamp_max if uclamp_min
* > uclamp_max.
*/
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
unsigned int min_cap;
unsigned int max_cap;
unsigned int cpu_cap;

/* Only heterogeneous systems can benefit from this check */
if (!static_branch_unlikely(&sched_asym_cpucapacity))
return true;

min_cap = uclamp_eff_value(p, UCLAMP_MIN);
max_cap = uclamp_eff_value(p, UCLAMP_MAX);

cpu_cap = capacity_orig_of(cpu);

return cpu_cap >= min(min_cap, max_cap);
}
#else
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
return true;
}
#endif

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
@@ -1391,6 +1430,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
bool test;

/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1422,10 +1462,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
*
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
*
* We take into account the capacity of the CPU to ensure it fits the
* requirement of the task - which is only important on heterogeneous
* systems like big.LITTLE.
*/
if (curr && unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
curr->prio <= p->prio)) {
test = curr &&
unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);

if (test || !rt_task_fits_capacity(p, cpu)) {
int target = find_lowest_rq(p);

/*
@@ -1449,15 +1495,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
!cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL))
return;

/*
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
if (p->nr_cpus_allowed != 1 &&
cpupri_find(&rq->rd->cpupri, p, NULL, NULL))
return;

/*
@@ -1601,7 +1647,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, p->cpus_ptr))
cpumask_test_cpu(cpu, p->cpus_ptr) &&
rt_task_fits_capacity(p, cpu))
return 1;

return 0;
@@ -1643,7 +1690,8 @@ static int find_lowest_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */

if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask,
rt_task_fits_capacity))
return -1; /* No targets found */

/*
@@ -2147,12 +2195,14 @@ skip:
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
(rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
bool need_to_push = !task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
(rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio);

if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq)))
push_rt_tasks(rq);
}

@@ -2224,7 +2274,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
bool need_to_push = rq->rt.overloaded ||
!rt_task_fits_capacity(p, cpu_of(rq));

if (p->nr_cpus_allowed > 1 && need_to_push)
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))

@@ -2300,14 +2300,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */

#ifdef CONFIG_UCLAMP_TASK
unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);

static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
struct task_struct *p)
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
struct task_struct *p)
{
unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);

if (p) {
min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
@@ -2324,18 +2324,10 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util,

return clamp(util, min_util, max_util);
}

static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
{
return uclamp_util_with(rq, util, NULL);
}
#else /* CONFIG_UCLAMP_TASK */
static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
struct task_struct *p)
{
return util;
}
static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
static inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
struct task_struct *p)
{
return util;
}
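
Behaviour-wise, uclamp_rq_util_with() is unchanged by the unsigned long conversion above; it composes the runqueue-wide clamps with those of a task about to be enqueued. A sketch of that composition (the runqueue and task clamp values below are arbitrary):

#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Sketch of uclamp_rq_util_with(): start from the rq-wide clamp window,
 * optionally widen it upwards by the clamps of a task p being enqueued,
 * then clamp the raw utilization into the resulting window. */
static unsigned long rq_util_with(unsigned long util,
				  unsigned long rq_min, unsigned long rq_max,
				  const unsigned long *p_min,
				  const unsigned long *p_max)
{
	unsigned long min_util = rq_min, max_util = rq_max;

	if (p_min && p_max) {
		min_util = max_ul(min_util, *p_min);
		max_util = max_ul(max_util, *p_max);
	}
	return clamp_ul(util, min_util, max_util);
}

int main(void)
{
	unsigned long tmin = 300, tmax = 700;

	/* Idle rq (clamps 0/0), raw util 150, boosted task about to wake. */
	printf("clamped: %lu\n", rq_util_with(150, 0, 0, &tmin, &tmax));
	return 0;
}

The boosted task raises the floor to 300 before the wakeup lands, which is what lets schedutil and the energy-aware placement above pre-account for the task's clamps.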

@@ -1879,6 +1879,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
return sd;
}

/*
* Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
* any two given CPUs at this (non-NUMA) topology level.
*/
static bool topology_span_sane(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, int cpu)
{
int i;

/* NUMA levels are allowed to overlap */
if (tl->flags & SDTL_OVERLAP)
return true;

/*
* Non-NUMA levels cannot partially overlap - they must be either
* completely equal or completely disjoint. Otherwise we can end up
* breaking the sched_group lists - i.e. a later get_group() pass
* breaks the linking done for an earlier span.
*/
for_each_cpu(i, cpu_map) {
if (i == cpu)
continue;
/*
* We should 'and' all those masks with 'cpu_map' to exactly
* match the topology we're about to build, but that can only
* remove CPUs, which only lessens our ability to detect
* overlaps
*/
if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
cpumask_intersects(tl->mask(cpu), tl->mask(i)))
return false;
}

return true;
}

/*
* Find the sched_domain_topology_level where all CPU capacities are visible
* for all CPUs.
@@ -1975,6 +2011,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
has_asym = true;
}

if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
goto error;

sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);

if (tl == sched_domain_topology)
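
The invariant topology_span_sane() enforces is easy to model outside the kernel: at a non-NUMA level, any two per-CPU masks must be either identical or disjoint. A sketch with uint64_t bitmasks standing in for struct cpumask (the mask values are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the topology_span_sane() rule: for every pair of per-CPU masks
 * at one topology level, partial overlap (neither equal nor disjoint) means
 * a broken firmware-reported topology. */
static bool spans_sane(const uint64_t *mask, int ncpus)
{
	for (int i = 0; i < ncpus; i++)
		for (int j = i + 1; j < ncpus; j++)
			if (mask[i] != mask[j] && (mask[i] & mask[j]))
				return false;
	return true;
}

int main(void)
{
	uint64_t good[4] = { 0x3, 0x3, 0xc, 0xc };	/* two disjoint pairs */
	uint64_t bad[4]  = { 0x3, 0x6, 0xc, 0xc };	/* 0x3 and 0x6 overlap */

	printf("good: %d, bad: %d\n", spans_sane(good, 4), spans_sane(bad, 4));
	return 0;
}

The kernel check only compares each CPU's mask against the others rather than all pairs at once, but the property it rejects is the same partial overlap shown by the bad[] example.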
|
||||
|
||||
@@ -179,6 +179,7 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int
|
||||
.bit_nr = -1,
|
||||
},
|
||||
.wq_entry = {
|
||||
.flags = flags,
|
||||
.private = current,
|
||||
.func = var_wake_function,
|
||||
.entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
|
||||
|
||||
@@ -442,7 +442,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
|
||||
* @cpumask were offline; otherwise, 0 if all executions of @fn
|
||||
* returned 0, any non zero return value if any returned non zero.
|
||||
*/
|
||||
int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
|
||||
static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
|
||||
{
|
||||
int ret;
|
||||
|
||||
@@ -453,36 +453,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* try_stop_cpus - try to stop multiple cpus
|
||||
* @cpumask: cpus to stop
|
||||
* @fn: function to execute
|
||||
* @arg: argument to @fn
|
||||
*
|
||||
* Identical to stop_cpus() except that it fails with -EAGAIN if
|
||||
* someone else is already using the facility.
|
||||
*
|
||||
* CONTEXT:
|
||||
* Might sleep.
|
||||
*
|
||||
* RETURNS:
|
||||
* -EAGAIN if someone else is already stopping cpus, -ENOENT if
|
||||
* @fn(@arg) was not executed at all because all cpus in @cpumask were
|
||||
* offline; otherwise, 0 if all executions of @fn returned 0, any non
|
||||
* zero return value if any returned non zero.
|
||||
*/
|
||||
int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* static works are used, process one request at a time */
|
||||
if (!mutex_trylock(&stop_cpus_mutex))
|
||||
return -EAGAIN;
|
||||
ret = __stop_cpus(cpumask, fn, arg);
|
||||
mutex_unlock(&stop_cpus_mutex);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int cpu_stop_should_run(unsigned int cpu)
|
||||
{
|
||||
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
|
||||
|
||||
@@ -1268,7 +1268,7 @@ static struct ctl_table kern_table[] = {
|
||||
.proc_handler = proc_do_static_key,
|
||||
},
|
||||
#endif
|
||||
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
|
||||
#if defined(CONFIG_TREE_RCU)
|
||||
{
|
||||
.procname = "panic_on_rcu_stall",
|
||||
.data = &sysctl_panic_on_rcu_stall,
|
||||
|
||||
@@ -703,6 +703,7 @@ struct send_signal_irq_work {
|
||||
struct irq_work irq_work;
|
||||
struct task_struct *task;
|
||||
u32 sig;
|
||||
enum pid_type type;
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
|
||||
@@ -712,10 +713,10 @@ static void do_bpf_send_signal(struct irq_work *entry)
|
||||
struct send_signal_irq_work *work;
|
||||
|
||||
work = container_of(entry, struct send_signal_irq_work, irq_work);
|
||||
group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
|
||||
group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
|
||||
}
|
||||
|
||||
BPF_CALL_1(bpf_send_signal, u32, sig)
|
||||
static int bpf_send_signal_common(u32 sig, enum pid_type type)
|
||||
{
|
||||
struct send_signal_irq_work *work = NULL;
|
||||
|
||||
@@ -748,11 +749,17 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
|
||||
*/
|
||||
work->task = current;
|
||||
work->sig = sig;
|
||||
work->type = type;
|
||||
irq_work_queue(&work->irq_work);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
|
||||
return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
|
||||
}
|
||||
|
||||
BPF_CALL_1(bpf_send_signal, u32, sig)
|
||||
{
|
||||
return bpf_send_signal_common(sig, PIDTYPE_TGID);
|
||||
}
|
||||
|
||||
static const struct bpf_func_proto bpf_send_signal_proto = {
|
||||
@@ -762,6 +769,18 @@ static const struct bpf_func_proto bpf_send_signal_proto = {
|
||||
.arg1_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
BPF_CALL_1(bpf_send_signal_thread, u32, sig)
|
||||
{
|
||||
return bpf_send_signal_common(sig, PIDTYPE_PID);
|
||||
}
|
||||
|
||||
static const struct bpf_func_proto bpf_send_signal_thread_proto = {
|
||||
.func = bpf_send_signal_thread,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
static const struct bpf_func_proto *
|
||||
tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
{
|
||||
@@ -822,6 +841,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
#endif
|
||||
case BPF_FUNC_send_signal:
|
||||
return &bpf_send_signal_proto;
|
||||
case BPF_FUNC_send_signal_thread:
|
||||
return &bpf_send_signal_thread_proto;
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -52,6 +52,9 @@ enum trace_type {
|
||||
#undef __field
|
||||
#define __field(type, item) type item;
|
||||
|
||||
#undef __field_fn
|
||||
#define __field_fn(type, item) type item;
|
||||
|
||||
#undef __field_struct
|
||||
#define __field_struct(type, item) __field(type, item)
|
||||
|
||||
@@ -71,26 +74,22 @@ enum trace_type {
|
||||
#define F_STRUCT(args...) args
|
||||
|
||||
#undef FTRACE_ENTRY
|
||||
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
|
||||
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
|
||||
struct struct_name { \
|
||||
struct trace_entry ent; \
|
||||
tstruct \
|
||||
}
|
||||
|
||||
#undef FTRACE_ENTRY_DUP
|
||||
#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
|
||||
#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
|
||||
|
||||
#undef FTRACE_ENTRY_REG
|
||||
#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
|
||||
filter, regfn) \
|
||||
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
|
||||
filter)
|
||||
#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \
|
||||
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
|
||||
|
||||
#undef FTRACE_ENTRY_PACKED
|
||||
#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \
|
||||
filter) \
|
||||
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
|
||||
filter) __packed
|
||||
#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print) \
|
||||
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed
|
||||
|
||||
#include "trace_entries.h"
|
||||
|
||||
@@ -1917,17 +1916,15 @@ extern void tracing_log_err(struct trace_array *tr,
|
||||
#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
|
||||
|
||||
#undef FTRACE_ENTRY
|
||||
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
|
||||
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
|
||||
extern struct trace_event_call \
|
||||
__aligned(4) event_##call;
|
||||
#undef FTRACE_ENTRY_DUP
|
||||
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
|
||||
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
|
||||
filter)
|
||||
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
|
||||
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
|
||||
#undef FTRACE_ENTRY_PACKED
|
||||
#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
|
||||
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
|
||||
filter)
|
||||
#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \
|
||||
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
|
||||
|
||||
#include "trace_entries.h"
|
||||
|
||||
|
||||
@@ -61,15 +61,13 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
|
||||
TRACE_FN,
|
||||
|
||||
F_STRUCT(
|
||||
__field( unsigned long, ip )
|
||||
__field( unsigned long, parent_ip )
|
||||
__field_fn( unsigned long, ip )
|
||||
__field_fn( unsigned long, parent_ip )
|
||||
),
|
||||
|
||||
F_printk(" %ps <-- %ps",
|
||||
(void *)__entry->ip, (void *)__entry->parent_ip),
|
||||
|
||||
FILTER_TRACE_FN,
|
||||
|
||||
perf_ftrace_event_register
|
||||
);
|
||||
|
||||
@@ -84,9 +82,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
__field_desc( int, graph_ent, depth )
),

F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth),

FILTER_OTHER
F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
);

/* Function return entry */
@@ -97,18 +93,16 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
__field_desc( unsigned long, ret, func )
__field_desc( unsigned long, ret, overrun )
__field_desc( unsigned long long, ret, calltime)
__field_desc( unsigned long long, ret, rettime )
__field_desc( unsigned long, ret, overrun )
__field_desc( int, ret, depth )
),

F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
(void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
__entry->depth),

FILTER_OTHER
__entry->depth)
);

/*
@@ -137,9 +131,7 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
__entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_pid, __entry->next_prio, __entry->next_state,
__entry->next_cpu),

FILTER_OTHER
__entry->next_cpu)
);

/*
@@ -157,9 +149,7 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
__entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_pid, __entry->next_prio, __entry->next_state,
__entry->next_cpu),

FILTER_OTHER
__entry->next_cpu)
);

/*
@@ -183,9 +173,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
(void *)__entry->caller[0], (void *)__entry->caller[1],
(void *)__entry->caller[2], (void *)__entry->caller[3],
(void *)__entry->caller[4], (void *)__entry->caller[5],
(void *)__entry->caller[6], (void *)__entry->caller[7]),

FILTER_OTHER
(void *)__entry->caller[6], (void *)__entry->caller[7])
);

FTRACE_ENTRY(user_stack, userstack_entry,
@@ -203,9 +191,7 @@ FTRACE_ENTRY(user_stack, userstack_entry,
(void *)__entry->caller[0], (void *)__entry->caller[1],
(void *)__entry->caller[2], (void *)__entry->caller[3],
(void *)__entry->caller[4], (void *)__entry->caller[5],
(void *)__entry->caller[6], (void *)__entry->caller[7]),

FILTER_OTHER
(void *)__entry->caller[6], (void *)__entry->caller[7])
);

/*
@@ -222,9 +208,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
),

F_printk("%ps: %s",
(void *)__entry->ip, __entry->fmt),

FILTER_OTHER
(void *)__entry->ip, __entry->fmt)
);

FTRACE_ENTRY_REG(print, print_entry,
@@ -239,8 +223,6 @@ FTRACE_ENTRY_REG(print, print_entry,
F_printk("%ps: %s",
(void *)__entry->ip, __entry->buf),

FILTER_OTHER,

ftrace_event_register
);

@@ -254,9 +236,7 @@ FTRACE_ENTRY(raw_data, raw_data_entry,
),

F_printk("id:%04x %08x",
__entry->id, (int)__entry->buf[0]),

FILTER_OTHER
__entry->id, (int)__entry->buf[0])
);

FTRACE_ENTRY(bputs, bputs_entry,
@@ -269,9 +249,7 @@ FTRACE_ENTRY(bputs, bputs_entry,
),

F_printk("%ps: %s",
(void *)__entry->ip, __entry->str),

FILTER_OTHER
(void *)__entry->ip, __entry->str)
);

FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -283,16 +261,14 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
__field_desc( resource_size_t, rw, phys )
__field_desc( unsigned long, rw, value )
__field_desc( unsigned long, rw, pc )
__field_desc( int, rw, map_id )
__field_desc( int, rw, map_id )
__field_desc( unsigned char, rw, opcode )
__field_desc( unsigned char, rw, width )
),

F_printk("%lx %lx %lx %d %x %x",
(unsigned long)__entry->phys, __entry->value, __entry->pc,
__entry->map_id, __entry->opcode, __entry->width),

FILTER_OTHER
__entry->map_id, __entry->opcode, __entry->width)
);

FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -304,15 +280,13 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
__field_desc( resource_size_t, map, phys )
__field_desc( unsigned long, map, virt )
__field_desc( unsigned long, map, len )
__field_desc( int, map, map_id )
__field_desc( int, map, map_id )
__field_desc( unsigned char, map, opcode )
),

F_printk("%lx %lx %lx %d %x",
(unsigned long)__entry->phys, __entry->virt, __entry->len,
__entry->map_id, __entry->opcode),

FILTER_OTHER
__entry->map_id, __entry->opcode)
);


@@ -334,9 +308,7 @@ FTRACE_ENTRY(branch, trace_branch,
F_printk("%u:%s:%s (%u)%s",
__entry->line,
__entry->func, __entry->file, __entry->correct,
__entry->constant ? " CONSTANT" : ""),

FILTER_OTHER
__entry->constant ? " CONSTANT" : "")
);


@@ -362,7 +334,5 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
__entry->duration,
__entry->outer_duration,
__entry->nmi_total_ts,
__entry->nmi_count),

FILTER_OTHER
__entry->nmi_count)
);

@@ -24,6 +24,7 @@
#include <linux/delay.h>

#include <trace/events/sched.h>
#include <trace/syscall.h>

#include <asm/setup.h>

@@ -2017,7 +2018,24 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
*/
head = trace_get_fields(call);
if (list_empty(head)) {
ret = call->class->define_fields(call);
struct trace_event_fields *field = call->class->fields_array;
unsigned int offset = sizeof(struct trace_entry);

for (; field->type; field++) {
if (field->type == TRACE_FUNCTION_TYPE) {
ret = field->define_fields(call);
break;
}

offset = ALIGN(offset, field->align);
ret = trace_define_field(call, field->type, field->name,
offset, field->size,
field->is_signed, field->filter_type);
if (ret)
break;

offset += field->size;
}
if (ret < 0) {
pr_warn("Could not initialize trace point events/%s\n",
name);

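The core of the rework is the loop above: instead of calling a generated define_fields() function per event, event_create_dir() now walks a static fields_array, laying each field out after the common trace_entry header with per-field alignment. A self-contained model of that offset walk (not part of this commit; field data and the 8-byte header stand-in for sizeof(struct trace_entry) are invented):

#include <stdio.h>
#include <stddef.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))  /* a: power of two */

struct field { const char *type, *name; unsigned int size, align; };

int main(void)
{
    struct field fields[] = {
        { "unsigned long", "ip",        sizeof(long), __alignof__(long) },
        { "unsigned char", "flags",     1, 1 },
        { "unsigned long", "parent_ip", sizeof(long), __alignof__(long) },
        { 0 }                     /* empty terminator, as in the diff */
    };
    unsigned int offset = 8;  /* stand-in for sizeof(struct trace_entry) */

    for (struct field *f = fields; f->type; f++) {
        offset = ALIGN(offset, f->align);
        printf("%-14s %-10s offset=%u size=%u\n",
               f->type, f->name, offset, f->size);
        offset += f->size;
    }
    return 0;
}
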
@@ -1155,6 +1155,12 @@ static struct synth_event *find_synth_event(const char *name)
return NULL;
}

static struct trace_event_fields synth_event_fields_array[] = {
{ .type = TRACE_FUNCTION_TYPE,
.define_fields = synth_event_define_fields },
{}
};

static int register_synth_event(struct synth_event *event)
{
struct trace_event_call *call = &event->call;
@@ -1176,7 +1182,7 @@ static int register_synth_event(struct synth_event *event)

INIT_LIST_HEAD(&call->class->fields);
call->event.funcs = &synth_event_funcs;
call->class->define_fields = synth_event_define_fields;
call->class->fields_array = synth_event_fields_array;

ret = register_trace_event(&call->event);
if (!ret) {

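Events whose fields are only known at runtime (synthetic events here, kprobes/uprobes/syscalls below) keep a callback: a single array entry whose type is the TRACE_FUNCTION_TYPE marker hands control back to a define_fields() function. A minimal stand-alone sketch of that sentinel-plus-callback pattern (not part of this commit; the marker value mirrors the kernel's all-ones-pointer definition, everything else is invented):

#include <stdio.h>

#define TRACE_FUNCTION_TYPE ((const char *)~0UL)  /* stand-in marker */

struct event_field {
    const char *type;
    int (*define_fields)(void *call);
};

static int synth_like_define_fields(void *call)
{
    (void)call;
    puts("dynamic field definition callback");
    return 0;
}

static const struct event_field fields[] = {
    { .type = TRACE_FUNCTION_TYPE, .define_fields = synth_like_define_fields },
    {}                                     /* sentinel terminator */
};

int main(void)
{
    for (const struct event_field *f = fields; f->type; f++)
        if (f->type == TRACE_FUNCTION_TYPE)
            return f->define_fields(NULL);
    return 0;
}
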
@@ -29,10 +29,8 @@ static int ftrace_event_register(struct trace_event_call *call,
* function and thus become accessible via perf.
*/
#undef FTRACE_ENTRY_REG
#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
filter, regfn) \
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))

/* not needed for this file */
#undef __field_struct
@@ -41,6 +39,9 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef __field
#define __field(type, item) type item;

#undef __field_fn
#define __field_fn(type, item) type item;

#undef __field_desc
#define __field_desc(type, container, item) type item;

@@ -60,7 +61,7 @@ static int ftrace_event_register(struct trace_event_call *call,
#define F_printk(fmt, args...) fmt, args

#undef FTRACE_ENTRY
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
struct ____ftrace_##name { \
tstruct \
}; \
@@ -73,76 +74,46 @@ static void __always_unused ____ftrace_check_##name(void) \
}

#undef FTRACE_ENTRY_DUP
#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))

#include "trace_entries.h"

#undef __field_ext
#define __field_ext(_type, _item, _filter_type) { \
.type = #_type, .name = #_item, \
.size = sizeof(_type), .align = __alignof__(_type), \
is_signed_type(_type), .filter_type = _filter_type },

#undef __field
#define __field(type, item) \
ret = trace_define_field(event_call, #type, #item, \
offsetof(typeof(field), item), \
sizeof(field.item), \
is_signed_type(type), filter_type); \
if (ret) \
return ret;
#define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER)

#undef __field_fn
#define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN)

#undef __field_desc
#define __field_desc(type, container, item) \
ret = trace_define_field(event_call, #type, #item, \
offsetof(typeof(field), \
container.item), \
sizeof(field.container.item), \
is_signed_type(type), filter_type); \
if (ret) \
return ret;
#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER)

#undef __array
#define __array(type, item, len) \
do { \
char *type_str = #type"["__stringify(len)"]"; \
BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
ret = trace_define_field(event_call, type_str, #item, \
offsetof(typeof(field), item), \
sizeof(field.item), \
is_signed_type(type), filter_type); \
if (ret) \
return ret; \
} while (0);
#define __array(_type, _item, _len) { \
.type = #_type"["__stringify(_len)"]", .name = #_item, \
.size = sizeof(_type[_len]), .align = __alignof__(_type), \
is_signed_type(_type), .filter_type = FILTER_OTHER },

#undef __array_desc
#define __array_desc(type, container, item, len) \
BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
ret = trace_define_field(event_call, #type "[" #len "]", #item, \
offsetof(typeof(field), \
container.item), \
sizeof(field.container.item), \
is_signed_type(type), filter_type); \
if (ret) \
return ret;
#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len)

#undef __dynamic_array
#define __dynamic_array(type, item) \
ret = trace_define_field(event_call, #type "[]", #item, \
offsetof(typeof(field), item), \
0, is_signed_type(type), filter_type);\
if (ret) \
return ret;
#define __dynamic_array(_type, _item) { \
.type = #_type "[]", .name = #_item, \
.size = 0, .align = __alignof__(_type), \
is_signed_type(_type), .filter_type = FILTER_OTHER },

#undef FTRACE_ENTRY
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
static int __init \
ftrace_define_fields_##name(struct trace_event_call *event_call) \
{ \
struct struct_name field; \
int ret; \
int filter_type = filter; \
\
tstruct; \
\
return ret; \
}
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
static struct trace_event_fields ftrace_event_fields_##name[] = { \
tstruct \
{} };

#include "trace_entries.h"

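Once trace_entries.h is included with these definitions, the generated code is data, not a function. Roughly what the rewritten FTRACE_ENTRY() produces for the function entry, hand-expanded here for illustration (not compilable on its own, uses kernel types; note the positional is_signed_type() initializer carried over verbatim from __field_ext() above, which lands on the is_signed member that follows align):

static struct trace_event_fields ftrace_event_fields_function[] = {
    { .type = "unsigned long", .name = "ip",
      .size = sizeof(unsigned long), .align = __alignof__(unsigned long),
      is_signed_type(unsigned long), .filter_type = FILTER_TRACE_FN },
    { .type = "unsigned long", .name = "parent_ip",
      .size = sizeof(unsigned long), .align = __alignof__(unsigned long),
      is_signed_type(unsigned long), .filter_type = FILTER_TRACE_FN },
    {}
};
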
@@ -152,6 +123,9 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \
#undef __field
#define __field(type, item)

#undef __field_fn
#define __field_fn(type, item)

#undef __field_desc
#define __field_desc(type, container, item)

@@ -168,12 +142,10 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \
#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args)

#undef FTRACE_ENTRY_REG
#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
regfn) \
\
#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn) \
static struct trace_event_class __refdata event_class_ftrace_##call = { \
.system = __stringify(TRACE_SYSTEM), \
.define_fields = ftrace_define_fields_##call, \
.fields_array = ftrace_event_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
.reg = regfn, \
}; \
@@ -191,9 +163,9 @@ static struct trace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;

#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
FTRACE_ENTRY_REG(call, struct_name, etype, \
PARAMS(tstruct), PARAMS(print), filter, NULL)
PARAMS(tstruct), PARAMS(print), NULL)

bool ftrace_event_is_function(struct trace_event_call *call)
{

@@ -1555,16 +1555,28 @@ static struct trace_event_functions kprobe_funcs = {
.trace = print_kprobe_event
};

static struct trace_event_fields kretprobe_fields_array[] = {
{ .type = TRACE_FUNCTION_TYPE,
.define_fields = kretprobe_event_define_fields },
{}
};

static struct trace_event_fields kprobe_fields_array[] = {
{ .type = TRACE_FUNCTION_TYPE,
.define_fields = kprobe_event_define_fields },
{}
};

static inline void init_trace_event_call(struct trace_kprobe *tk)
{
struct trace_event_call *call = trace_probe_event_call(&tk->tp);

if (trace_kprobe_is_return(tk)) {
call->event.funcs = &kretprobe_funcs;
call->class->define_fields = kretprobe_event_define_fields;
call->class->fields_array = kretprobe_fields_array;
} else {
call->event.funcs = &kprobe_funcs;
call->class->define_fields = kprobe_event_define_fields;
call->class->fields_array = kprobe_fields_array;
}

call->flags = TRACE_EVENT_FL_KPROBE;

@@ -876,7 +876,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
for (i = 0; i < tp->nr_args; i++) {
parg = tp->args + i;
if (parg->count) {
if (strcmp(parg->type->name, "string") == 0)
if ((strcmp(parg->type->name, "string") == 0) ||
(strcmp(parg->type->name, "ustring") == 0))
fmt = ", __get_str(%s[%d])";
else
fmt = ", REC->%s[%d]";
@@ -884,7 +885,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
pos += snprintf(buf + pos, LEN_OR_ZERO,
fmt, parg->name, j);
} else {
if (strcmp(parg->type->name, "string") == 0)
if ((strcmp(parg->type->name, "string") == 0) ||
(strcmp(parg->type->name, "ustring") == 0))
fmt = ", __get_str(%s)";
else
fmt = ", REC->%s";

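The two hunks above make "ustring" probe arguments pick the same __get_str() print format that "string" already used. A self-contained reading of the selection logic (not part of this commit; function name and test types invented):

#include <stdio.h>
#include <string.h>

static const char *fmt_for(const char *type)
{
    if ((strcmp(type, "string") == 0) ||
        (strcmp(type, "ustring") == 0))
        return ", __get_str(%s)";
    return ", REC->%s";
}

int main(void)
{
    printf("%s\n", fmt_for("ustring")); /* ", __get_str(%s)" */
    printf("%s\n", fmt_for("u64"));     /* ", REC->%s" */
    return 0;
}
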
@@ -203,11 +203,10 @@ print_syscall_exit(struct trace_iterator *iter, int flags,

extern char *__bad_type_size(void);

#define SYSCALL_FIELD(type, field, name) \
sizeof(type) != sizeof(trace.field) ? \
__bad_type_size() : \
#type, #name, offsetof(typeof(trace), field), \
sizeof(trace.field), is_signed_type(type)
#define SYSCALL_FIELD(_type, _name) { \
.type = #_type, .name = #_name, \
.size = sizeof(_type), .align = __alignof__(_type), \
.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }

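Where the old SYSCALL_FIELD() generated an argument list for trace_define_field() (with the __bad_type_size() size check), the new one is a plain struct initializer. Hand-expanding its first use from later in this diff gives roughly the following (illustrative fragment, not compilable on its own):

/* SYSCALL_FIELD(int, __syscall_nr) expands to approximately: */
{ .type = "int", .name = "__syscall_nr",
  .size = sizeof(int), .align = __alignof__(int),
  .is_signed = is_signed_type(int), .filter_type = FILTER_OTHER }
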
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
@@ -274,42 +273,23 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
int ret;
int i;
int offset = offsetof(typeof(trace), args);

ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
FILTER_OTHER);
if (ret)
return ret;
int ret = 0;
int i;

for (i = 0; i < meta->nb_args; i++) {
ret = trace_define_field(call, meta->types[i],
meta->args[i], offset,
sizeof(unsigned long), 0,
FILTER_OTHER);
if (ret)
break;
offset += sizeof(unsigned long);
}

return ret;
}

static int __init syscall_exit_define_fields(struct trace_event_call *call)
{
struct syscall_trace_exit trace;
int ret;

ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
FILTER_OTHER);
if (ret)
return ret;

ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret),
FILTER_OTHER);

return ret;
}

static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
@@ -507,6 +487,13 @@ static int __init init_syscall_trace(struct trace_event_call *call)
return id;
}

static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
SYSCALL_FIELD(int, __syscall_nr),
{ .type = TRACE_FUNCTION_TYPE,
.define_fields = syscall_enter_define_fields },
{}
};

struct trace_event_functions enter_syscall_print_funcs = {
.trace = print_syscall_enter,
};
@@ -518,7 +505,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
struct trace_event_class __refdata event_class_syscall_enter = {
.system = "syscalls",
.reg = syscall_enter_register,
.define_fields = syscall_enter_define_fields,
.fields_array = syscall_enter_fields_array,
.get_fields = syscall_get_enter_fields,
.raw_init = init_syscall_trace,
};
@@ -526,7 +513,11 @@ struct trace_event_class __refdata event_class_syscall_enter = {
struct trace_event_class __refdata event_class_syscall_exit = {
.system = "syscalls",
.reg = syscall_exit_register,
.define_fields = syscall_exit_define_fields,
.fields_array = (struct trace_event_fields[]){
SYSCALL_FIELD(int, __syscall_nr),
SYSCALL_FIELD(long, ret),
{}
},
.fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
.raw_init = init_syscall_trace,
};

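For the exit class the field table is small and fixed, so it is supplied inline as a C99 compound literal rather than a named array. A minimal stand-alone model of that pattern (not part of this commit; struct shapes and names invented):

#include <stdio.h>

struct field { const char *name; };
struct klass { struct field *fields_array; };

/* A file-scope compound literal has static storage duration, so its
 * address is a valid constant initializer here -- the same shape as
 * the .fields_array assignment above. */
static struct klass k = {
    .fields_array = (struct field[]){
        { "__syscall_nr" },
        { "ret" },
        {}                /* terminator */
    },
};

int main(void)
{
    for (struct field *f = k.fields_array; f->name; f++)
        puts(f->name);
    return 0;
}
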
@@ -1528,12 +1528,17 @@ static struct trace_event_functions uprobe_funcs = {
.trace = print_uprobe_event
};

static struct trace_event_fields uprobe_fields_array[] = {
{ .type = TRACE_FUNCTION_TYPE,
.define_fields = uprobe_event_define_fields },
{}
};

static inline void init_trace_event_call(struct trace_uprobe *tu)
{
struct trace_event_call *call = trace_probe_event_call(&tu->tp);

call->event.funcs = &uprobe_funcs;
call->class->define_fields = uprobe_event_define_fields;
call->class->fields_array = uprobe_fields_array;

call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY;
call->class->reg = trace_uprobe_register;

@@ -2280,7 +2280,7 @@ __acquires(&pool->lock)
}

/*
* The following prevents a kworker from hogging CPU on !PREEMPT
* The following prevents a kworker from hogging CPU on !PREEMPTION
* kernels, where a requeueing work item waiting for something to
* happen could deadlock with stop_machine as such work item could
* indefinitely requeue itself while all other CPUs are trapped in