Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-10-12

The main changes are:

1) BPF verifier improvements to track register allocation patterns, from Alexei and Yonghong.

2) libbpf relocation support for different size load/store, from Andrii.

3) bpf_redirect_peer() helper and support for inner map array with different max_entries, from Daniel.

4) BPF support for per-cpu variables, from Hao.

5) sockmap improvements, from John.
====================

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski
2020-10-12 16:16:50 -07:00
73 changed files with 4334 additions and 767 deletions

View File

@@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
struct net_device *orig_dev)
struct net_device *orig_dev, bool *another)
{
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
@@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
skb_do_redirect(skb);
if (skb_do_redirect(skb) == -EAGAIN) {
__skb_pull(skb, skb->mac_len);
*another = true;
break;
}
return NULL;
case TC_ACT_CONSUMED:
return NULL;
@@ -5163,7 +5167,12 @@ another_round:
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
bool another = false;
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
&another);
if (another)
goto another_round;
if (!skb)
goto out;

View File

@@ -76,6 +76,7 @@
#include <net/bpf_sk_storage.h>
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id);
@@ -2379,8 +2380,9 @@ out:
/* Internal, non-exposed redirect flags. */
enum {
BPF_F_NEIGH = (1ULL << 1),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH)
BPF_F_NEIGH = (1ULL << 1),
BPF_F_PEER = (1ULL << 2),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER)
};
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@@ -2429,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
int skb_do_redirect(struct sk_buff *skb)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct net *net = dev_net(skb->dev);
struct net_device *dev;
u32 flags = ri->flags;
dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
dev = dev_get_by_index_rcu(net, ri->tgt_index);
ri->tgt_index = 0;
if (unlikely(!dev)) {
kfree_skb(skb);
return -EINVAL;
}
ri->flags = 0;
if (unlikely(!dev))
goto out_drop;
if (flags & BPF_F_PEER) {
const struct net_device_ops *ops = dev->netdev_ops;
if (unlikely(!ops->ndo_get_peer_dev ||
!skb_at_tc_ingress(skb)))
goto out_drop;
dev = ops->ndo_get_peer_dev(dev);
if (unlikely(!dev ||
!is_skb_forwardable(dev, skb) ||
net_eq(net, dev_net(dev))))
goto out_drop;
skb->dev = dev;
return -EAGAIN;
}
return flags & BPF_F_NEIGH ?
__bpf_redirect_neigh(skb, dev) :
__bpf_redirect(skb, dev, flags);
out_drop:
kfree_skb(skb);
return -EINVAL;
}
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
@@ -2465,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
/* bpf_redirect_peer() helper: record a peer redirect request on this CPU.
 *
 * @ifindex: interface index stored as the redirect target.
 * @flags:   must be zero; any other value makes the helper return
 *           TC_ACT_SHOT without touching the per-CPU redirect state.
 *
 * On success the per-CPU bpf_redirect_info is tagged with the internal
 * BPF_F_PEER flag and TC_ACT_REDIRECT is returned.
 */
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
	struct bpf_redirect_info *info;

	if (unlikely(flags))
		return TC_ACT_SHOT;

	info = this_cpu_ptr(&bpf_redirect_info);
	info->tgt_index = ifindex;
	info->flags = BPF_F_PEER;

	return TC_ACT_REDIRECT;
}
/* Helper descriptor for bpf_redirect_peer(): not GPL-only, integer return,
 * and both arguments (ifindex, flags) are typed ARG_ANYTHING.
 * tc_cls_act_func_proto() returns it for BPF_FUNC_redirect_peer.
 */
static const struct bpf_func_proto bpf_redirect_peer_proto = {
.func = bpf_redirect_peer,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
@@ -3479,6 +3518,48 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb)
SKB_MAX_ALLOC;
}
/* skb_adjust_room variant exposed to sk_skb programs.
 *
 * @skb:      buffer whose head room is adjusted.
 * @len_diff: number of bytes to add at the head (positive) or to remove
 *            from the head (negative).
 * @mode:     must be zero.
 * @flags:    must be zero.
 *
 * Returns -EINVAL when @mode or @flags is set, -EFAULT when the absolute
 * adjustment exceeds 0xfff bytes, -ENOMEM when the bytes to remove cannot
 * be pulled, or the skb_cow() error when growing fails. Otherwise returns
 * the (non-negative) skb_cow() result when growing and 0 when shrinking.
 */
BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
	   u32, mode, u64, flags)
{
	u32 delta = abs(len_diff);
	int err = 0;

	if (unlikely(mode || flags))
		return -EINVAL;
	if (unlikely(delta > 0xfffU))
		return -EFAULT;

	if (len_diff < 0) {
		/* Shrink: drop @delta bytes from the front of the data. */
		if (unlikely(!pskb_may_pull(skb, delta)))
			return -ENOMEM;
		__skb_pull(skb, delta);
	} else {
		/* Grow: make sure head room is available, then expose it zeroed. */
		err = skb_cow(skb, len_diff);
		if (unlikely(err < 0))
			return err;
		__skb_push(skb, delta);
		memset(skb->data, 0, delta);
	}

	/* skb->data moved, so refresh the data_end seen by the program. */
	bpf_compute_data_end_sk_skb(skb);

	/* With a TLS software RX context the strparser message length is
	 * kept in step with the adjustment.
	 */
	if (tls_sw_has_ctx_rx(skb->sk)) {
		struct strp_msg *msg = strp_msg(skb);

		msg->full_len += len_diff;
	}

	return err;
}
/* Helper descriptor for sk_skb_adjust_room(): not GPL-only, integer return,
 * first argument is the program context (ARG_PTR_TO_CTX) and the remaining
 * three (len_diff, mode, flags) are typed ARG_ANYTHING.
 * sk_skb_func_proto() returns it for BPF_FUNC_skb_adjust_room.
 */
static const struct bpf_func_proto sk_skb_adjust_room_proto = {
.func = sk_skb_adjust_room,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_ANYTHING,
};
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
@@ -4784,6 +4865,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
else
icsk->icsk_user_timeout = val;
break;
case TCP_NOTSENT_LOWAT:
tp->notsent_lowat = val;
sk->sk_write_space(sk);
break;
default:
ret = -EINVAL;
}
@@ -5149,7 +5234,6 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
memcpy(params->smac, dev->dev_addr, ETH_ALEN);
params->h_vlan_TCI = 0;
params->h_vlan_proto = 0;
params->ifindex = dev->ifindex;
return 0;
}
@@ -5246,6 +5330,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
dev = nhc->nhc_dev;
params->rt_metric = res.fi->fib_priority;
params->ifindex = dev->ifindex;
/* xdp and cls_bpf programs are run in RCU-bh so
* rcu_read_lock_bh is not needed here
@@ -5371,6 +5456,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
dev = res.nh->fib_nh_dev;
params->rt_metric = res.f6i->fib6_metric;
params->ifindex = dev->ifindex;
/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
* not needed here.
@@ -6745,6 +6831,7 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_skb_change_tail ||
func == sk_skb_change_tail ||
func == bpf_skb_adjust_room ||
func == sk_skb_adjust_room ||
func == bpf_skb_pull_data ||
func == sk_skb_pull_data ||
func == bpf_clone_redirect ||
@@ -7005,6 +7092,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_redirect_proto;
case BPF_FUNC_redirect_neigh:
return &bpf_redirect_neigh_proto;
case BPF_FUNC_redirect_peer:
return &bpf_redirect_peer_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
@@ -7218,6 +7307,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &sk_skb_change_tail_proto;
case BPF_FUNC_skb_change_head:
return &sk_skb_change_head_proto;
case BPF_FUNC_skb_adjust_room:
return &sk_skb_adjust_room_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:

View File

@@ -433,10 +433,12 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len, bool ingress)
{
if (ingress)
return sk_psock_skb_ingress(psock, skb);
else
if (!ingress) {
if (!sock_writeable(psock->sk))
return -EAGAIN;
return skb_send_sock_locked(psock->sk, skb, off, len);
}
return sk_psock_skb_ingress(psock, skb);
}
static void sk_psock_backlog(struct work_struct *work)
@@ -625,6 +627,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
rcu_assign_sk_user_data(sk, NULL);
if (psock->progs.skb_parser)
sk_psock_stop_strp(sk, psock);
else if (psock->progs.skb_verdict)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
@@ -682,19 +686,8 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
struct sk_buff *skb)
{
int ret;
skb->sk = psock->sk;
bpf_compute_data_end_sk_skb(skb);
ret = bpf_prog_run_pin_on_cpu(prog, skb);
/* strparser clones the skb before handing it to a upper layer,
* meaning skb_orphan has been called. We NULL sk on the way out
* to ensure we don't trigger a BUG_ON() in skb/sk operations
* later and because we are not charging the memory of this skb
* to any socket yet.
*/
skb->sk = NULL;
return ret;
return bpf_prog_run_pin_on_cpu(prog, skb);
}
static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
@@ -709,38 +702,35 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
bool ingress;
sk_other = tcp_skb_bpf_redirect_fetch(skb);
/* This error is a buggy BPF program, it returned a redirect
* return code, but then didn't set a redirect interface.
*/
if (unlikely(!sk_other)) {
kfree_skb(skb);
return;
}
psock_other = sk_psock(sk_other);
/* This error indicates the socket is being torn down or had another
* error that caused the pipe to break. We can't send a packet on
* a socket that is in this state so we drop the skb.
*/
if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
kfree_skb(skb);
return;
}
ingress = tcp_skb_bpf_ingress(skb);
if ((!ingress && sock_writeable(sk_other)) ||
(ingress &&
atomic_read(&sk_other->sk_rmem_alloc) <=
sk_other->sk_rcvbuf)) {
if (!ingress)
skb_set_owner_w(skb, sk_other);
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
} else {
kfree_skb(skb);
}
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
}
static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict)
static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
{
switch (verdict) {
case __SK_REDIRECT:
skb_set_owner_r(skb, sk);
sk_psock_skb_redirect(skb);
break;
case __SK_PASS:
@@ -758,11 +748,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
rcu_read_lock();
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
/* We skip full set_owner_r here because if we do a SK_PASS
* or SK_DROP we can skip skb memory accounting and use the
* TLS context.
*/
skb->sk = psock->sk;
tcp_skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_tls_verdict_apply(skb, ret);
sk_psock_tls_verdict_apply(skb, psock->sk, ret);
rcu_read_unlock();
return ret;
}
@@ -771,7 +767,9 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
struct tcp_skb_cb *tcp;
struct sock *sk_other;
int err = -EIO;
switch (verdict) {
case __SK_PASS:
@@ -780,16 +778,24 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
goto out_free;
}
if (atomic_read(&sk_other->sk_rmem_alloc) <=
sk_other->sk_rcvbuf) {
struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);
tcp->bpf.flags |= BPF_F_INGRESS;
tcp = TCP_SKB_CB(skb);
tcp->bpf.flags |= BPF_F_INGRESS;
/* If the queue is empty then we can submit directly
* into the msg queue. If it's not empty we have to
* queue work, otherwise we may get OOO data. If
* sk_psock_skb_ingress fails, the error is handled by
* retrying later from the workqueue.
*/
*/
if (skb_queue_empty(&psock->ingress_skb)) {
err = sk_psock_skb_ingress(psock, skb);
}
if (err < 0) {
skb_queue_tail(&psock->ingress_skb, skb);
schedule_work(&psock->work);
break;
}
goto out_free;
break;
case __SK_REDIRECT:
sk_psock_skb_redirect(skb);
break;
@@ -814,9 +820,9 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
kfree_skb(skb);
goto out;
}
skb_set_owner_r(skb, sk);
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
skb_orphan(skb);
tcp_skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
@@ -839,8 +845,11 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
rcu_read_lock();
prog = READ_ONCE(psock->progs.skb_parser);
if (likely(prog))
if (likely(prog)) {
skb->sk = psock->sk;
ret = sk_psock_bpf_run(psock, prog, skb);
skb->sk = NULL;
}
rcu_read_unlock();
return ret;
}
@@ -864,6 +873,57 @@ static void sk_psock_strp_data_ready(struct sock *sk)
rcu_read_unlock();
}
/* read_sock callback used by sk_psock_verdict_data_ready().
 *
 * Clones @skb, runs the psock's skb_verdict program (if any) on the clone
 * and hands the clone plus the resulting verdict to
 * sk_psock_verdict_apply(). The socket is taken from desc->arg.data;
 * @offset and @orig_len are not used.
 *
 * Returns the length of the original @skb, or 0 when the clone could not
 * be allocated (desc->error is then set to -ENOMEM) or when the socket has
 * no psock.
 */
static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t orig_len)
{
struct sock *sk = (struct sock *)desc->arg.data;
struct sk_psock *psock;
struct bpf_prog *prog;
/* Verdict that is applied when no skb_verdict program is attached. */
int ret = __SK_DROP;
/* Length of the original skb, read before skb is replaced by its clone. */
int len = skb->len;
/* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb) {
desc->error = -ENOMEM;
return 0;
}
rcu_read_lock();
psock = sk_psock(sk);
if (unlikely(!psock)) {
/* No psock on this socket: free the clone and report 0 bytes. */
len = 0;
kfree_skb(skb);
goto out;
}
skb_set_owner_r(skb, sk);
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
/* Clear any previous redirect target before the program runs, then
 * map the program's return code together with the target it set.
 */
tcp_skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
}
sk_psock_verdict_apply(psock, skb, ret);
out:
rcu_read_unlock();
return len;
}
static void sk_psock_verdict_data_ready(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
read_descriptor_t desc;
if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
return;
desc.arg.data = sk;
desc.error = 0;
desc.count = 1;
sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
}
static void sk_psock_write_space(struct sock *sk)
{
struct sk_psock *psock;
@@ -893,6 +953,19 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
return strp_init(&psock->parser.strp, sk, &cb);
}
/* Hook the verdict-only receive path into @sk.
 *
 * No-op when the psock's parser is already marked enabled. Otherwise the
 * current sk_data_ready callback is saved in the parser state, the socket's
 * data_ready and write_space callbacks are pointed at the psock handlers,
 * and the parser is marked enabled.
 */
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
{
	if (!psock->parser.enabled) {
		psock->parser.saved_data_ready = sk->sk_data_ready;
		sk->sk_data_ready = sk_psock_verdict_data_ready;
		sk->sk_write_space = sk_psock_write_space;
		psock->parser.enabled = true;
	}
}
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_parser *parser = &psock->parser;
@@ -918,3 +991,15 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
strp_stop(&parser->strp);
parser->enabled = false;
}
/* Undo sk_psock_start_verdict() on @sk.
 *
 * No-op when the psock's parser is not marked enabled. Otherwise the saved
 * sk_data_ready callback is put back on the socket, the saved pointer is
 * cleared, and the parser is marked disabled.
 */
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
{
	if (psock->parser.enabled) {
		sk->sk_data_ready = psock->parser.saved_data_ready;
		psock->parser.saved_data_ready = NULL;
		psock->parser.enabled = false;
	}
}

View File

@@ -148,8 +148,8 @@ static void sock_map_add_link(struct sk_psock *psock,
static void sock_map_del_link(struct sock *sk,
struct sk_psock *psock, void *link_raw)
{
bool strp_stop = false, verdict_stop = false;
struct sk_psock_link *link, *tmp;
bool strp_stop = false;
spin_lock_bh(&psock->link_lock);
list_for_each_entry_safe(link, tmp, &psock->link, list) {
@@ -159,14 +159,19 @@ static void sock_map_del_link(struct sock *sk,
map);
if (psock->parser.enabled && stab->progs.skb_parser)
strp_stop = true;
if (psock->parser.enabled && stab->progs.skb_verdict)
verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
}
}
spin_unlock_bh(&psock->link_lock);
if (strp_stop) {
if (strp_stop || verdict_stop) {
write_lock_bh(&sk->sk_callback_lock);
sk_psock_stop_strp(sk, psock);
if (strp_stop)
sk_psock_stop_strp(sk, psock);
else
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
}
}
@@ -230,16 +235,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
{
struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
struct sk_psock *psock;
bool skb_progs;
int ret;
skb_verdict = READ_ONCE(progs->skb_verdict);
skb_parser = READ_ONCE(progs->skb_parser);
skb_progs = skb_parser && skb_verdict;
if (skb_progs) {
if (skb_verdict) {
skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
if (IS_ERR(skb_verdict))
return PTR_ERR(skb_verdict);
}
if (skb_parser) {
skb_parser = bpf_prog_inc_not_zero(skb_parser);
if (IS_ERR(skb_parser)) {
bpf_prog_put(skb_verdict);
@@ -264,7 +269,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
if (psock) {
if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
(skb_progs && READ_ONCE(psock->progs.skb_parser))) {
(skb_parser && READ_ONCE(psock->progs.skb_parser)) ||
(skb_verdict && READ_ONCE(psock->progs.skb_verdict))) {
sk_psock_put(sk, psock);
ret = -EBUSY;
goto out_progs;
@@ -285,28 +291,31 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
goto out_drop;
write_lock_bh(&sk->sk_callback_lock);
if (skb_progs && !psock->parser.enabled) {
if (skb_parser && skb_verdict && !psock->parser.enabled) {
ret = sk_psock_init_strp(sk, psock);
if (ret) {
write_unlock_bh(&sk->sk_callback_lock);
goto out_drop;
}
if (ret)
goto out_unlock_drop;
psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
psock_set_prog(&psock->progs.skb_parser, skb_parser);
sk_psock_start_strp(sk, psock);
} else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
sk_psock_start_verdict(sk,psock);
}
write_unlock_bh(&sk->sk_callback_lock);
return 0;
out_unlock_drop:
write_unlock_bh(&sk->sk_callback_lock);
out_drop:
sk_psock_put(sk, psock);
out_progs:
if (msg_parser)
bpf_prog_put(msg_parser);
out:
if (skb_progs) {
if (skb_verdict)
bpf_prog_put(skb_verdict);
if (skb_parser)
bpf_prog_put(skb_parser);
}
return ret;
}