mirror of
https://github.com/torvalds/linux.git
synced 2024-11-26 06:02:05 +00:00
0b17ad25d8
Move skb->sk assignment out of sk_psock_bpf_run() and into individual
callers. Then we can use proper skb_set_owner_r() call to assign a
sk to a skb. This improves things by also charging the truesize against
the sockets sk_rmem_alloc counter. With this done we get some accounting
in place to ensure the memory associated with skbs on the workqueue are
still being accounted for somewhere. Finally, by using skb_set_owner_r
the destructor is setup so we can just let the normal skb_kfree logic
recover the memory. Combined with previous patch dropping skb_orphan()
we now can recover from memory pressure and maintain accounting.
Note, we will charge the skbs against their originating socket even
if being redirected into another socket. Once the skb completes the
redirect op the kfree_skb will give the memory back. This is important
because if we charged the socket we are redirecting to (like it was
done before this series) the sock_writeable() test could fail because
of the skb trying to be sent is already charged against the socket.
Also TLS case is special. Here we wait until we have decided not to
simply PASS the packet up the stack. In the case where we PASS the
packet up the stack we already have an skb which is accounted for on
the TLS socket context.
For the parser case we continue to just set/clear skb->sk this is
because the skb being used here may be combined with other skbs or
turned into multiple skbs depending on the parser logic. For example
the parser could request a payload length greater than skb->len so
that the strparser needs to collect multiple skbs. At any rate
the final result will be handled in the strparser recv callback.
Fixes: 604326b41a
("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/160226867513.5692.10579573214635925960.stgit@john-Precision-5820-Tower
928 lines
21 KiB
C
928 lines
21 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
|
|
|
|
#include <linux/skmsg.h>
|
|
#include <linux/skbuff.h>
|
|
#include <linux/scatterlist.h>
|
|
|
|
#include <net/sock.h>
|
|
#include <net/tcp.h>
|
|
#include <net/tls.h>
|
|
|
|
static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
|
|
{
|
|
if (msg->sg.end > msg->sg.start &&
|
|
elem_first_coalesce < msg->sg.end)
|
|
return true;
|
|
|
|
if (msg->sg.end < msg->sg.start &&
|
|
(elem_first_coalesce > msg->sg.start ||
|
|
elem_first_coalesce < msg->sg.end))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
|
|
int elem_first_coalesce)
|
|
{
|
|
struct page_frag *pfrag = sk_page_frag(sk);
|
|
int ret = 0;
|
|
|
|
len -= msg->sg.size;
|
|
while (len > 0) {
|
|
struct scatterlist *sge;
|
|
u32 orig_offset;
|
|
int use, i;
|
|
|
|
if (!sk_page_frag_refill(sk, pfrag))
|
|
return -ENOMEM;
|
|
|
|
orig_offset = pfrag->offset;
|
|
use = min_t(int, len, pfrag->size - orig_offset);
|
|
if (!sk_wmem_schedule(sk, use))
|
|
return -ENOMEM;
|
|
|
|
i = msg->sg.end;
|
|
sk_msg_iter_var_prev(i);
|
|
sge = &msg->sg.data[i];
|
|
|
|
if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
|
|
sg_page(sge) == pfrag->page &&
|
|
sge->offset + sge->length == orig_offset) {
|
|
sge->length += use;
|
|
} else {
|
|
if (sk_msg_full(msg)) {
|
|
ret = -ENOSPC;
|
|
break;
|
|
}
|
|
|
|
sge = &msg->sg.data[msg->sg.end];
|
|
sg_unmark_end(sge);
|
|
sg_set_page(sge, pfrag->page, use, orig_offset);
|
|
get_page(pfrag->page);
|
|
sk_msg_iter_next(msg, end);
|
|
}
|
|
|
|
sk_mem_charge(sk, use);
|
|
msg->sg.size += use;
|
|
pfrag->offset += use;
|
|
len -= use;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_msg_alloc);
|
|
|
|
int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
|
|
u32 off, u32 len)
|
|
{
|
|
int i = src->sg.start;
|
|
struct scatterlist *sge = sk_msg_elem(src, i);
|
|
struct scatterlist *sgd = NULL;
|
|
u32 sge_len, sge_off;
|
|
|
|
while (off) {
|
|
if (sge->length > off)
|
|
break;
|
|
off -= sge->length;
|
|
sk_msg_iter_var_next(i);
|
|
if (i == src->sg.end && off)
|
|
return -ENOSPC;
|
|
sge = sk_msg_elem(src, i);
|
|
}
|
|
|
|
while (len) {
|
|
sge_len = sge->length - off;
|
|
if (sge_len > len)
|
|
sge_len = len;
|
|
|
|
if (dst->sg.end)
|
|
sgd = sk_msg_elem(dst, dst->sg.end - 1);
|
|
|
|
if (sgd &&
|
|
(sg_page(sge) == sg_page(sgd)) &&
|
|
(sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) {
|
|
sgd->length += sge_len;
|
|
dst->sg.size += sge_len;
|
|
} else if (!sk_msg_full(dst)) {
|
|
sge_off = sge->offset + off;
|
|
sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);
|
|
} else {
|
|
return -ENOSPC;
|
|
}
|
|
|
|
off = 0;
|
|
len -= sge_len;
|
|
sk_mem_charge(sk, sge_len);
|
|
sk_msg_iter_var_next(i);
|
|
if (i == src->sg.end && len)
|
|
return -ENOSPC;
|
|
sge = sk_msg_elem(src, i);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_msg_clone);
|
|
|
|
void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
|
|
{
|
|
int i = msg->sg.start;
|
|
|
|
do {
|
|
struct scatterlist *sge = sk_msg_elem(msg, i);
|
|
|
|
if (bytes < sge->length) {
|
|
sge->length -= bytes;
|
|
sge->offset += bytes;
|
|
sk_mem_uncharge(sk, bytes);
|
|
break;
|
|
}
|
|
|
|
sk_mem_uncharge(sk, sge->length);
|
|
bytes -= sge->length;
|
|
sge->length = 0;
|
|
sge->offset = 0;
|
|
sk_msg_iter_var_next(i);
|
|
} while (bytes && i != msg->sg.end);
|
|
msg->sg.start = i;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_msg_return_zero);
|
|
|
|
void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
|
|
{
|
|
int i = msg->sg.start;
|
|
|
|
do {
|
|
struct scatterlist *sge = &msg->sg.data[i];
|
|
int uncharge = (bytes < sge->length) ? bytes : sge->length;
|
|
|
|
sk_mem_uncharge(sk, uncharge);
|
|
bytes -= uncharge;
|
|
sk_msg_iter_var_next(i);
|
|
} while (i != msg->sg.end);
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_msg_return);
|
|
|
|
static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
|
|
bool charge)
|
|
{
|
|
struct scatterlist *sge = sk_msg_elem(msg, i);
|
|
u32 len = sge->length;
|
|
|
|
if (charge)
|
|
sk_mem_uncharge(sk, len);
|
|
if (!msg->skb)
|
|
put_page(sg_page(sge));
|
|
memset(sge, 0, sizeof(*sge));
|
|
return len;
|
|
}
|
|
|
|
static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
|
|
bool charge)
|
|
{
|
|
struct scatterlist *sge = sk_msg_elem(msg, i);
|
|
int freed = 0;
|
|
|
|
while (msg->sg.size) {
|
|
msg->sg.size -= sge->length;
|
|
freed += sk_msg_free_elem(sk, msg, i, charge);
|
|
sk_msg_iter_var_next(i);
|
|
sk_msg_check_to_free(msg, i, msg->sg.size);
|
|
sge = sk_msg_elem(msg, i);
|
|
}
|
|
consume_skb(msg->skb);
|
|
sk_msg_init(msg);
|
|
return freed;
|
|
}
|
|
|
|
int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
|
|
{
|
|
return __sk_msg_free(sk, msg, msg->sg.start, false);
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
|
|
|
|
int sk_msg_free(struct sock *sk, struct sk_msg *msg)
|
|
{
|
|
return __sk_msg_free(sk, msg, msg->sg.start, true);
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_msg_free);
|
|
|
|
static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
|
|
u32 bytes, bool charge)
|
|
{
|
|
struct scatterlist *sge;
|
|
u32 i = msg->sg.start;
|
|
|
|
while (bytes) {
|
|
sge = sk_msg_elem(msg, i);
|
|
if (!sge->length)
|
|
break;
|
|
if (bytes < sge->length) {
|
|
if (charge)
|
|
sk_mem_uncharge(sk, bytes);
|
|
sge->length -= bytes;
|
|
sge->offset += bytes;
|
|
msg->sg.size -= bytes;
|
|
break;
|
|
}
|
|
|
|
msg->sg.size -= sge->length;
|
|
bytes -= sge->length;
|
|
sk_msg_free_elem(sk, msg, i, charge);
|
|
sk_msg_iter_var_next(i);
|
|
sk_msg_check_to_free(msg, i, bytes);
|
|
}
|
|
msg->sg.start = i;
|
|
}
|
|
|
|
void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
|
|
{
|
|
__sk_msg_free_partial(sk, msg, bytes, true);
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_msg_free_partial);
|
|
|
|
void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
|
|
u32 bytes)
|
|
{
|
|
__sk_msg_free_partial(sk, msg, bytes, false);
|
|
}
|
|
|
|
void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
|
|
{
|
|
int trim = msg->sg.size - len;
|
|
u32 i = msg->sg.end;
|
|
|
|
if (trim <= 0) {
|
|
WARN_ON(trim < 0);
|
|
return;
|
|
}
|
|
|
|
sk_msg_iter_var_prev(i);
|
|
msg->sg.size = len;
|
|
while (msg->sg.data[i].length &&
|
|
trim >= msg->sg.data[i].length) {
|
|
trim -= msg->sg.data[i].length;
|
|
sk_msg_free_elem(sk, msg, i, true);
|
|
sk_msg_iter_var_prev(i);
|
|
if (!trim)
|
|
goto out;
|
|
}
|
|
|
|
msg->sg.data[i].length -= trim;
|
|
sk_mem_uncharge(sk, trim);
|
|
/* Adjust copybreak if it falls into the trimmed part of last buf */
|
|
if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length)
|
|
msg->sg.copybreak = msg->sg.data[i].length;
|
|
out:
|
|
sk_msg_iter_var_next(i);
|
|
msg->sg.end = i;
|
|
|
|
/* If we trim data a full sg elem before curr pointer update
|
|
* copybreak and current so that any future copy operations
|
|
* start at new copy location.
|
|
* However trimed data that has not yet been used in a copy op
|
|
* does not require an update.
|
|
*/
|
|
if (!msg->sg.size) {
|
|
msg->sg.curr = msg->sg.start;
|
|
msg->sg.copybreak = 0;
|
|
} else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >=
|
|
sk_msg_iter_dist(msg->sg.start, msg->sg.end)) {
|
|
sk_msg_iter_var_prev(i);
|
|
msg->sg.curr = i;
|
|
msg->sg.copybreak = msg->sg.data[i].length;
|
|
}
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_msg_trim);
|
|
|
|
int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
|
|
struct sk_msg *msg, u32 bytes)
|
|
{
|
|
int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
|
|
const int to_max_pages = MAX_MSG_FRAGS;
|
|
struct page *pages[MAX_MSG_FRAGS];
|
|
ssize_t orig, copied, use, offset;
|
|
|
|
orig = msg->sg.size;
|
|
while (bytes > 0) {
|
|
i = 0;
|
|
maxpages = to_max_pages - num_elems;
|
|
if (maxpages == 0) {
|
|
ret = -EFAULT;
|
|
goto out;
|
|
}
|
|
|
|
copied = iov_iter_get_pages(from, pages, bytes, maxpages,
|
|
&offset);
|
|
if (copied <= 0) {
|
|
ret = -EFAULT;
|
|
goto out;
|
|
}
|
|
|
|
iov_iter_advance(from, copied);
|
|
bytes -= copied;
|
|
msg->sg.size += copied;
|
|
|
|
while (copied) {
|
|
use = min_t(int, copied, PAGE_SIZE - offset);
|
|
sg_set_page(&msg->sg.data[msg->sg.end],
|
|
pages[i], use, offset);
|
|
sg_unmark_end(&msg->sg.data[msg->sg.end]);
|
|
sk_mem_charge(sk, use);
|
|
|
|
offset = 0;
|
|
copied -= use;
|
|
sk_msg_iter_next(msg, end);
|
|
num_elems++;
|
|
i++;
|
|
}
|
|
/* When zerocopy is mixed with sk_msg_*copy* operations we
|
|
* may have a copybreak set in this case clear and prefer
|
|
* zerocopy remainder when possible.
|
|
*/
|
|
msg->sg.copybreak = 0;
|
|
msg->sg.curr = msg->sg.end;
|
|
}
|
|
out:
|
|
/* Revert iov_iter updates, msg will need to use 'trim' later if it
|
|
* also needs to be cleared.
|
|
*/
|
|
if (ret)
|
|
iov_iter_revert(from, msg->sg.size - orig);
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
|
|
|
|
int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
|
|
struct sk_msg *msg, u32 bytes)
|
|
{
|
|
int ret = -ENOSPC, i = msg->sg.curr;
|
|
struct scatterlist *sge;
|
|
u32 copy, buf_size;
|
|
void *to;
|
|
|
|
do {
|
|
sge = sk_msg_elem(msg, i);
|
|
/* This is possible if a trim operation shrunk the buffer */
|
|
if (msg->sg.copybreak >= sge->length) {
|
|
msg->sg.copybreak = 0;
|
|
sk_msg_iter_var_next(i);
|
|
if (i == msg->sg.end)
|
|
break;
|
|
sge = sk_msg_elem(msg, i);
|
|
}
|
|
|
|
buf_size = sge->length - msg->sg.copybreak;
|
|
copy = (buf_size > bytes) ? bytes : buf_size;
|
|
to = sg_virt(sge) + msg->sg.copybreak;
|
|
msg->sg.copybreak += copy;
|
|
if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
|
|
ret = copy_from_iter_nocache(to, copy, from);
|
|
else
|
|
ret = copy_from_iter(to, copy, from);
|
|
if (ret != copy) {
|
|
ret = -EFAULT;
|
|
goto out;
|
|
}
|
|
bytes -= copy;
|
|
if (!bytes)
|
|
break;
|
|
msg->sg.copybreak = 0;
|
|
sk_msg_iter_var_next(i);
|
|
} while (i != msg->sg.end);
|
|
out:
|
|
msg->sg.curr = i;
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
|
|
|
|
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
|
|
{
|
|
struct sock *sk = psock->sk;
|
|
int copied = 0, num_sge;
|
|
struct sk_msg *msg;
|
|
|
|
msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
|
|
if (unlikely(!msg))
|
|
return -EAGAIN;
|
|
if (!sk_rmem_schedule(sk, skb, skb->len)) {
|
|
kfree(msg);
|
|
return -EAGAIN;
|
|
}
|
|
|
|
sk_msg_init(msg);
|
|
num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
|
|
if (unlikely(num_sge < 0)) {
|
|
kfree(msg);
|
|
return num_sge;
|
|
}
|
|
|
|
sk_mem_charge(sk, skb->len);
|
|
copied = skb->len;
|
|
msg->sg.start = 0;
|
|
msg->sg.size = copied;
|
|
msg->sg.end = num_sge;
|
|
msg->skb = skb;
|
|
|
|
sk_psock_queue_msg(psock, msg);
|
|
sk_psock_data_ready(sk, psock);
|
|
return copied;
|
|
}
|
|
|
|
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
|
|
u32 off, u32 len, bool ingress)
|
|
{
|
|
if (!ingress) {
|
|
if (!sock_writeable(psock->sk))
|
|
return -EAGAIN;
|
|
return skb_send_sock_locked(psock->sk, skb, off, len);
|
|
}
|
|
return sk_psock_skb_ingress(psock, skb);
|
|
}
|
|
|
|
static void sk_psock_backlog(struct work_struct *work)
|
|
{
|
|
struct sk_psock *psock = container_of(work, struct sk_psock, work);
|
|
struct sk_psock_work_state *state = &psock->work_state;
|
|
struct sk_buff *skb;
|
|
bool ingress;
|
|
u32 len, off;
|
|
int ret;
|
|
|
|
/* Lock sock to avoid losing sk_socket during loop. */
|
|
lock_sock(psock->sk);
|
|
if (state->skb) {
|
|
skb = state->skb;
|
|
len = state->len;
|
|
off = state->off;
|
|
state->skb = NULL;
|
|
goto start;
|
|
}
|
|
|
|
while ((skb = skb_dequeue(&psock->ingress_skb))) {
|
|
len = skb->len;
|
|
off = 0;
|
|
start:
|
|
ingress = tcp_skb_bpf_ingress(skb);
|
|
do {
|
|
ret = -EIO;
|
|
if (likely(psock->sk->sk_socket))
|
|
ret = sk_psock_handle_skb(psock, skb, off,
|
|
len, ingress);
|
|
if (ret <= 0) {
|
|
if (ret == -EAGAIN) {
|
|
state->skb = skb;
|
|
state->len = len;
|
|
state->off = off;
|
|
goto end;
|
|
}
|
|
/* Hard errors break pipe and stop xmit. */
|
|
sk_psock_report_error(psock, ret ? -ret : EPIPE);
|
|
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
|
|
kfree_skb(skb);
|
|
goto end;
|
|
}
|
|
off += ret;
|
|
len -= ret;
|
|
} while (len);
|
|
|
|
if (!ingress)
|
|
kfree_skb(skb);
|
|
}
|
|
end:
|
|
release_sock(psock->sk);
|
|
}
|
|
|
|
struct sk_psock *sk_psock_init(struct sock *sk, int node)
|
|
{
|
|
struct sk_psock *psock;
|
|
struct proto *prot;
|
|
|
|
write_lock_bh(&sk->sk_callback_lock);
|
|
|
|
if (inet_csk_has_ulp(sk)) {
|
|
psock = ERR_PTR(-EINVAL);
|
|
goto out;
|
|
}
|
|
|
|
if (sk->sk_user_data) {
|
|
psock = ERR_PTR(-EBUSY);
|
|
goto out;
|
|
}
|
|
|
|
psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node);
|
|
if (!psock) {
|
|
psock = ERR_PTR(-ENOMEM);
|
|
goto out;
|
|
}
|
|
|
|
prot = READ_ONCE(sk->sk_prot);
|
|
psock->sk = sk;
|
|
psock->eval = __SK_NONE;
|
|
psock->sk_proto = prot;
|
|
psock->saved_unhash = prot->unhash;
|
|
psock->saved_close = prot->close;
|
|
psock->saved_write_space = sk->sk_write_space;
|
|
|
|
INIT_LIST_HEAD(&psock->link);
|
|
spin_lock_init(&psock->link_lock);
|
|
|
|
INIT_WORK(&psock->work, sk_psock_backlog);
|
|
INIT_LIST_HEAD(&psock->ingress_msg);
|
|
skb_queue_head_init(&psock->ingress_skb);
|
|
|
|
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
|
|
refcount_set(&psock->refcnt, 1);
|
|
|
|
rcu_assign_sk_user_data_nocopy(sk, psock);
|
|
sock_hold(sk);
|
|
|
|
out:
|
|
write_unlock_bh(&sk->sk_callback_lock);
|
|
return psock;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_psock_init);
|
|
|
|
struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
|
|
{
|
|
struct sk_psock_link *link;
|
|
|
|
spin_lock_bh(&psock->link_lock);
|
|
link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
|
|
list);
|
|
if (link)
|
|
list_del(&link->list);
|
|
spin_unlock_bh(&psock->link_lock);
|
|
return link;
|
|
}
|
|
|
|
void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
|
|
{
|
|
struct sk_msg *msg, *tmp;
|
|
|
|
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
|
|
list_del(&msg->list);
|
|
sk_msg_free(psock->sk, msg);
|
|
kfree(msg);
|
|
}
|
|
}
|
|
|
|
static void sk_psock_zap_ingress(struct sk_psock *psock)
|
|
{
|
|
__skb_queue_purge(&psock->ingress_skb);
|
|
__sk_psock_purge_ingress_msg(psock);
|
|
}
|
|
|
|
static void sk_psock_link_destroy(struct sk_psock *psock)
|
|
{
|
|
struct sk_psock_link *link, *tmp;
|
|
|
|
list_for_each_entry_safe(link, tmp, &psock->link, list) {
|
|
list_del(&link->list);
|
|
sk_psock_free_link(link);
|
|
}
|
|
}
|
|
|
|
static void sk_psock_destroy_deferred(struct work_struct *gc)
|
|
{
|
|
struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
|
|
|
|
/* No sk_callback_lock since already detached. */
|
|
|
|
/* Parser has been stopped */
|
|
if (psock->progs.skb_parser)
|
|
strp_done(&psock->parser.strp);
|
|
|
|
cancel_work_sync(&psock->work);
|
|
|
|
psock_progs_drop(&psock->progs);
|
|
|
|
sk_psock_link_destroy(psock);
|
|
sk_psock_cork_free(psock);
|
|
sk_psock_zap_ingress(psock);
|
|
|
|
if (psock->sk_redir)
|
|
sock_put(psock->sk_redir);
|
|
sock_put(psock->sk);
|
|
kfree(psock);
|
|
}
|
|
|
|
void sk_psock_destroy(struct rcu_head *rcu)
|
|
{
|
|
struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
|
|
|
|
INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
|
|
schedule_work(&psock->gc);
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_psock_destroy);
|
|
|
|
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
|
|
{
|
|
sk_psock_cork_free(psock);
|
|
sk_psock_zap_ingress(psock);
|
|
|
|
write_lock_bh(&sk->sk_callback_lock);
|
|
sk_psock_restore_proto(sk, psock);
|
|
rcu_assign_sk_user_data(sk, NULL);
|
|
if (psock->progs.skb_parser)
|
|
sk_psock_stop_strp(sk, psock);
|
|
write_unlock_bh(&sk->sk_callback_lock);
|
|
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
|
|
|
|
call_rcu(&psock->rcu, sk_psock_destroy);
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_psock_drop);
|
|
|
|
static int sk_psock_map_verd(int verdict, bool redir)
|
|
{
|
|
switch (verdict) {
|
|
case SK_PASS:
|
|
return redir ? __SK_REDIRECT : __SK_PASS;
|
|
case SK_DROP:
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return __SK_DROP;
|
|
}
|
|
|
|
int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
|
|
struct sk_msg *msg)
|
|
{
|
|
struct bpf_prog *prog;
|
|
int ret;
|
|
|
|
rcu_read_lock();
|
|
prog = READ_ONCE(psock->progs.msg_parser);
|
|
if (unlikely(!prog)) {
|
|
ret = __SK_PASS;
|
|
goto out;
|
|
}
|
|
|
|
sk_msg_compute_data_pointers(msg);
|
|
msg->sk = sk;
|
|
ret = bpf_prog_run_pin_on_cpu(prog, msg);
|
|
ret = sk_psock_map_verd(ret, msg->sk_redir);
|
|
psock->apply_bytes = msg->apply_bytes;
|
|
if (ret == __SK_REDIRECT) {
|
|
if (psock->sk_redir)
|
|
sock_put(psock->sk_redir);
|
|
psock->sk_redir = msg->sk_redir;
|
|
if (!psock->sk_redir) {
|
|
ret = __SK_DROP;
|
|
goto out;
|
|
}
|
|
sock_hold(psock->sk_redir);
|
|
}
|
|
out:
|
|
rcu_read_unlock();
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
|
|
|
|
static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
|
|
struct sk_buff *skb)
|
|
{
|
|
bpf_compute_data_end_sk_skb(skb);
|
|
return bpf_prog_run_pin_on_cpu(prog, skb);
|
|
}
|
|
|
|
static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
|
|
{
|
|
struct sk_psock_parser *parser;
|
|
|
|
parser = container_of(strp, struct sk_psock_parser, strp);
|
|
return container_of(parser, struct sk_psock, parser);
|
|
}
|
|
|
|
static void sk_psock_skb_redirect(struct sk_buff *skb)
|
|
{
|
|
struct sk_psock *psock_other;
|
|
struct sock *sk_other;
|
|
|
|
sk_other = tcp_skb_bpf_redirect_fetch(skb);
|
|
/* This error is a buggy BPF program, it returned a redirect
|
|
* return code, but then didn't set a redirect interface.
|
|
*/
|
|
if (unlikely(!sk_other)) {
|
|
kfree_skb(skb);
|
|
return;
|
|
}
|
|
psock_other = sk_psock(sk_other);
|
|
/* This error indicates the socket is being torn down or had another
|
|
* error that caused the pipe to break. We can't send a packet on
|
|
* a socket that is in this state so we drop the skb.
|
|
*/
|
|
if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
|
|
!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
|
|
kfree_skb(skb);
|
|
return;
|
|
}
|
|
|
|
skb_queue_tail(&psock_other->ingress_skb, skb);
|
|
schedule_work(&psock_other->work);
|
|
}
|
|
|
|
static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
|
|
{
|
|
switch (verdict) {
|
|
case __SK_REDIRECT:
|
|
skb_set_owner_r(skb, sk);
|
|
sk_psock_skb_redirect(skb);
|
|
break;
|
|
case __SK_PASS:
|
|
case __SK_DROP:
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
|
|
{
|
|
struct bpf_prog *prog;
|
|
int ret = __SK_PASS;
|
|
|
|
rcu_read_lock();
|
|
prog = READ_ONCE(psock->progs.skb_verdict);
|
|
if (likely(prog)) {
|
|
/* We skip full set_owner_r here because if we do a SK_PASS
|
|
* or SK_DROP we can skip skb memory accounting and use the
|
|
* TLS context.
|
|
*/
|
|
skb->sk = psock->sk;
|
|
tcp_skb_bpf_redirect_clear(skb);
|
|
ret = sk_psock_bpf_run(psock, prog, skb);
|
|
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
|
|
skb->sk = NULL;
|
|
}
|
|
sk_psock_tls_verdict_apply(skb, psock->sk, ret);
|
|
rcu_read_unlock();
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
|
|
|
|
static void sk_psock_verdict_apply(struct sk_psock *psock,
|
|
struct sk_buff *skb, int verdict)
|
|
{
|
|
struct tcp_skb_cb *tcp;
|
|
struct sock *sk_other;
|
|
int err = -EIO;
|
|
|
|
switch (verdict) {
|
|
case __SK_PASS:
|
|
sk_other = psock->sk;
|
|
if (sock_flag(sk_other, SOCK_DEAD) ||
|
|
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
|
|
goto out_free;
|
|
}
|
|
|
|
tcp = TCP_SKB_CB(skb);
|
|
tcp->bpf.flags |= BPF_F_INGRESS;
|
|
|
|
/* If the queue is empty then we can submit directly
|
|
* into the msg queue. If its not empty we have to
|
|
* queue work otherwise we may get OOO data. Otherwise,
|
|
* if sk_psock_skb_ingress errors will be handled by
|
|
* retrying later from workqueue.
|
|
*/
|
|
if (skb_queue_empty(&psock->ingress_skb)) {
|
|
err = sk_psock_skb_ingress(psock, skb);
|
|
}
|
|
if (err < 0) {
|
|
skb_queue_tail(&psock->ingress_skb, skb);
|
|
schedule_work(&psock->work);
|
|
}
|
|
break;
|
|
case __SK_REDIRECT:
|
|
sk_psock_skb_redirect(skb);
|
|
break;
|
|
case __SK_DROP:
|
|
default:
|
|
out_free:
|
|
kfree_skb(skb);
|
|
}
|
|
}
|
|
|
|
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
|
|
{
|
|
struct sk_psock *psock;
|
|
struct bpf_prog *prog;
|
|
int ret = __SK_DROP;
|
|
struct sock *sk;
|
|
|
|
rcu_read_lock();
|
|
sk = strp->sk;
|
|
psock = sk_psock(sk);
|
|
if (unlikely(!psock)) {
|
|
kfree_skb(skb);
|
|
goto out;
|
|
}
|
|
skb_set_owner_r(skb, sk);
|
|
prog = READ_ONCE(psock->progs.skb_verdict);
|
|
if (likely(prog)) {
|
|
tcp_skb_bpf_redirect_clear(skb);
|
|
ret = sk_psock_bpf_run(psock, prog, skb);
|
|
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
|
|
}
|
|
sk_psock_verdict_apply(psock, skb, ret);
|
|
out:
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
static int sk_psock_strp_read_done(struct strparser *strp, int err)
|
|
{
|
|
return err;
|
|
}
|
|
|
|
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
|
|
{
|
|
struct sk_psock *psock = sk_psock_from_strp(strp);
|
|
struct bpf_prog *prog;
|
|
int ret = skb->len;
|
|
|
|
rcu_read_lock();
|
|
prog = READ_ONCE(psock->progs.skb_parser);
|
|
if (likely(prog)) {
|
|
skb->sk = psock->sk;
|
|
ret = sk_psock_bpf_run(psock, prog, skb);
|
|
skb->sk = NULL;
|
|
}
|
|
rcu_read_unlock();
|
|
return ret;
|
|
}
|
|
|
|
/* Called with socket lock held. */
|
|
static void sk_psock_strp_data_ready(struct sock *sk)
|
|
{
|
|
struct sk_psock *psock;
|
|
|
|
rcu_read_lock();
|
|
psock = sk_psock(sk);
|
|
if (likely(psock)) {
|
|
if (tls_sw_has_ctx_rx(sk)) {
|
|
psock->parser.saved_data_ready(sk);
|
|
} else {
|
|
write_lock_bh(&sk->sk_callback_lock);
|
|
strp_data_ready(&psock->parser.strp);
|
|
write_unlock_bh(&sk->sk_callback_lock);
|
|
}
|
|
}
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
static void sk_psock_write_space(struct sock *sk)
|
|
{
|
|
struct sk_psock *psock;
|
|
void (*write_space)(struct sock *sk) = NULL;
|
|
|
|
rcu_read_lock();
|
|
psock = sk_psock(sk);
|
|
if (likely(psock)) {
|
|
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
|
|
schedule_work(&psock->work);
|
|
write_space = psock->saved_write_space;
|
|
}
|
|
rcu_read_unlock();
|
|
if (write_space)
|
|
write_space(sk);
|
|
}
|
|
|
|
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
|
|
{
|
|
static const struct strp_callbacks cb = {
|
|
.rcv_msg = sk_psock_strp_read,
|
|
.read_sock_done = sk_psock_strp_read_done,
|
|
.parse_msg = sk_psock_strp_parse,
|
|
};
|
|
|
|
psock->parser.enabled = false;
|
|
return strp_init(&psock->parser.strp, sk, &cb);
|
|
}
|
|
|
|
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
|
|
{
|
|
struct sk_psock_parser *parser = &psock->parser;
|
|
|
|
if (parser->enabled)
|
|
return;
|
|
|
|
parser->saved_data_ready = sk->sk_data_ready;
|
|
sk->sk_data_ready = sk_psock_strp_data_ready;
|
|
sk->sk_write_space = sk_psock_write_space;
|
|
parser->enabled = true;
|
|
}
|
|
|
|
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
|
|
{
|
|
struct sk_psock_parser *parser = &psock->parser;
|
|
|
|
if (!parser->enabled)
|
|
return;
|
|
|
|
sk->sk_data_ready = parser->saved_data_ready;
|
|
parser->saved_data_ready = NULL;
|
|
strp_stop(&parser->strp);
|
|
parser->enabled = false;
|
|
}
|