mptcp: allocate fwd memory separately on the rx and tx path

All the mptcp receive path is protected by the msk socket
spinlock. As consequences, the tx path has to play a few tricks to
allocate the forward memory without acquiring the spinlock multiple
times, making the overall TX path quite complex.

This patch tries to clean-up a bit the tx path, using completely
separated fwd memory allocation, for the rx and the tx path.

The forward memory allocated in the rx path is now accounted in
msk->rmem_fwd_alloc and is (still) protected by the msk socket spinlock.

To cope with the above we provide a few MPTCP-specific variants for
the helpers to charge, uncharge, reclaim and free the forward memory
in the receive path.

msk->sk_forward_alloc now accounts only the forward memory for the tx
path, we can use the plain core sock helper to manipulate it and drop
quite a bit of complexity.

On memory pressure, both rx and tx fwd memories are reclaimed.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Paolo Abeni 2021-10-26 16:29:15 -07:00 committed by Jakub Kicinski
parent 292e6077b0
commit 6511882cdd
2 changed files with 96 additions and 146 deletions

View File

@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
__kfree_skb(skb);
}
static void mptcp_rmem_charge(struct sock *sk, int size)
{
mptcp_sk(sk)->rmem_fwd_alloc -= size;
}
static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
struct sk_buff *from)
{
@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
kfree_skb_partial(from, fragstolen);
atomic_add(delta, &sk->sk_rmem_alloc);
sk_mem_charge(sk, delta);
mptcp_rmem_charge(sk, delta);
return true;
}
@ -155,6 +160,44 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
return mptcp_try_coalesce((struct sock *)msk, to, from);
}
static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
{
amount >>= SK_MEM_QUANTUM_SHIFT;
mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
__sk_mem_reduce_allocated(sk, amount);
}
static void mptcp_rmem_uncharge(struct sock *sk, int size)
{
struct mptcp_sock *msk = mptcp_sk(sk);
int reclaimable;
msk->rmem_fwd_alloc += size;
reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
/* see sk_mem_uncharge() for the rationale behind the following schema */
if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
__mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK);
}
static void mptcp_rfree(struct sk_buff *skb)
{
unsigned int len = skb->truesize;
struct sock *sk = skb->sk;
atomic_sub(len, &sk->sk_rmem_alloc);
mptcp_rmem_uncharge(sk, len);
}
static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
skb->sk = sk;
skb->destructor = mptcp_rfree;
atomic_add(skb->truesize, &sk->sk_rmem_alloc);
mptcp_rmem_charge(sk, skb->truesize);
}
/* "inspired" by tcp_data_queue_ofo(), main differences:
* - use mptcp seqs
* - don't cope with sacks
@ -267,7 +310,29 @@ merge_right:
end:
skb_condense(skb);
skb_set_owner_r(skb, sk);
mptcp_set_owner_r(skb, sk);
}
static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
{
struct mptcp_sock *msk = mptcp_sk(sk);
int amt, amount;
if (size < msk->rmem_fwd_alloc)
return true;
amt = sk_mem_pages(size);
amount = amt << SK_MEM_QUANTUM_SHIFT;
msk->rmem_fwd_alloc += amount;
if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
if (ssk->sk_forward_alloc < amount) {
msk->rmem_fwd_alloc -= amount;
return false;
}
ssk->sk_forward_alloc -= amount;
}
return true;
}
static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
@ -285,15 +350,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
skb_orphan(skb);
/* try to fetch required memory from subflow */
if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;
if (ssk->sk_forward_alloc < amount)
goto drop;
ssk->sk_forward_alloc -= amount;
sk->sk_forward_alloc += amount;
}
if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
goto drop;
has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
@ -313,7 +371,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
if (tail && mptcp_try_coalesce(sk, tail, skb))
return true;
skb_set_owner_r(skb, sk);
mptcp_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
return true;
} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
@ -908,122 +966,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
df->data_seq + df->data_len == msk->write_seq;
}
static int mptcp_wmem_with_overhead(int size)
{
return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
}
static void __mptcp_wmem_reserve(struct sock *sk, int size)
{
int amount = mptcp_wmem_with_overhead(size);
struct mptcp_sock *msk = mptcp_sk(sk);
WARN_ON_ONCE(msk->wmem_reserved);
if (WARN_ON_ONCE(amount < 0))
amount = 0;
if (amount <= sk->sk_forward_alloc)
goto reserve;
/* under memory pressure try to reserve at most a single page
* otherwise try to reserve the full estimate and fallback
* to a single page before entering the error path
*/
if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
!sk_wmem_schedule(sk, amount)) {
if (amount <= PAGE_SIZE)
goto nomem;
amount = PAGE_SIZE;
if (!sk_wmem_schedule(sk, amount))
goto nomem;
}
reserve:
msk->wmem_reserved = amount;
sk->sk_forward_alloc -= amount;
return;
nomem:
/* we will wait for memory on next allocation */
msk->wmem_reserved = -1;
}
static void __mptcp_update_wmem(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
lockdep_assert_held_once(&sk->sk_lock.slock);
if (!msk->wmem_reserved)
return;
if (msk->wmem_reserved < 0)
msk->wmem_reserved = 0;
if (msk->wmem_reserved > 0) {
sk->sk_forward_alloc += msk->wmem_reserved;
msk->wmem_reserved = 0;
}
}
static bool mptcp_wmem_alloc(struct sock *sk, int size)
{
struct mptcp_sock *msk = mptcp_sk(sk);
/* check for pre-existing error condition */
if (msk->wmem_reserved < 0)
return false;
if (msk->wmem_reserved >= size)
goto account;
mptcp_data_lock(sk);
if (!sk_wmem_schedule(sk, size)) {
mptcp_data_unlock(sk);
return false;
}
sk->sk_forward_alloc -= size;
msk->wmem_reserved += size;
mptcp_data_unlock(sk);
account:
msk->wmem_reserved -= size;
return true;
}
static void mptcp_wmem_uncharge(struct sock *sk, int size)
{
struct mptcp_sock *msk = mptcp_sk(sk);
if (msk->wmem_reserved < 0)
msk->wmem_reserved = 0;
msk->wmem_reserved += size;
}
static void __mptcp_mem_reclaim_partial(struct sock *sk)
{
int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
lockdep_assert_held_once(&sk->sk_lock.slock);
__mptcp_update_wmem(sk);
__mptcp_rmem_reclaim(sk, reclaimable - 1);
sk_mem_reclaim_partial(sk);
}
static void mptcp_mem_reclaim_partial(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
/* if we are experiencing a transint allocation error,
* the forward allocation memory has been already
* released
*/
if (msk->wmem_reserved < 0)
return;
mptcp_data_lock(sk);
sk->sk_forward_alloc += msk->wmem_reserved;
sk_mem_reclaim_partial(sk);
msk->wmem_reserved = sk->sk_forward_alloc;
sk->sk_forward_alloc = 0;
__mptcp_mem_reclaim_partial(sk);
mptcp_data_unlock(sk);
}
@ -1664,7 +1620,6 @@ out:
/* __mptcp_alloc_tx_skb could have released some wmem and we are
* not going to flush it via release_sock()
*/
__mptcp_update_wmem(sk);
if (copied) {
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
@ -1701,7 +1656,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* silently ignore everything else */
msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
lock_sock(sk);
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@ -1749,17 +1704,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
psize = min_t(size_t, psize, msg_data_left(msg));
total_ts = psize + frag_truesize;
if (!mptcp_wmem_alloc(sk, total_ts))
if (!sk_wmem_schedule(sk, total_ts))
goto wait_for_memory;
if (copy_page_from_iter(dfrag->page, offset, psize,
&msg->msg_iter) != psize) {
mptcp_wmem_uncharge(sk, psize + frag_truesize);
ret = -EFAULT;
goto out;
}
/* data successfully copied into the write queue */
sk->sk_forward_alloc -= total_ts;
copied += psize;
dfrag->data_len += psize;
frag_truesize += psize;
@ -1956,7 +1911,7 @@ static void __mptcp_update_rmem(struct sock *sk)
return;
atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
sk_mem_uncharge(sk, msk->rmem_released);
mptcp_rmem_uncharge(sk, msk->rmem_released);
WRITE_ONCE(msk->rmem_released, 0);
}
@ -2024,7 +1979,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
lock_sock(sk);
if (unlikely(sk->sk_state == TCP_LISTEN)) {
copied = -ENOTCONN;
goto out_err;
@ -2504,7 +2459,7 @@ static int __mptcp_init_sock(struct sock *sk)
__skb_queue_head_init(&msk->receive_queue);
msk->out_of_order_queue = RB_ROOT;
msk->first_pending = NULL;
msk->wmem_reserved = 0;
msk->rmem_fwd_alloc = 0;
WRITE_ONCE(msk->rmem_released, 0);
msk->timer_ival = TCP_RTO_MIN;
@ -2715,7 +2670,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
sk->sk_prot->destroy(sk);
WARN_ON_ONCE(msk->wmem_reserved);
WARN_ON_ONCE(msk->rmem_fwd_alloc);
WARN_ON_ONCE(msk->rmem_released);
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
@ -2948,8 +2903,14 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_receive_queue);
skb_rbtree_purge(&msk->out_of_order_queue);
/* move all the rx fwd alloc into the sk_mem_reclaim_final in
* inet_sock_destruct() will dispose it
*/
sk->sk_forward_alloc += msk->rmem_fwd_alloc;
msk->rmem_fwd_alloc = 0;
mptcp_token_destroy(msk);
mptcp_pm_free_anno_list(msk);
}
@ -3031,10 +2992,6 @@ static void mptcp_release_cb(struct sock *sk)
if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
__mptcp_error_report(sk);
/* push_pending may touch wmem_reserved, ensure we do the cleanup
* later
*/
__mptcp_update_wmem(sk);
__mptcp_update_rmem(sk);
}
@ -3184,6 +3141,11 @@ static void mptcp_shutdown(struct sock *sk, int how)
__mptcp_wr_shutdown(sk);
}
static int mptcp_forward_alloc_get(const struct sock *sk)
{
return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc;
}
static struct proto mptcp_prot = {
.name = "MPTCP",
.owner = THIS_MODULE,
@ -3201,6 +3163,7 @@ static struct proto mptcp_prot = {
.hash = mptcp_hash,
.unhash = mptcp_unhash,
.get_port = mptcp_get_port,
.forward_alloc_get = mptcp_forward_alloc_get,
.sockets_allocated = &mptcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,

View File

@ -227,7 +227,7 @@ struct mptcp_sock {
u64 ack_seq;
u64 rcv_wnd_sent;
u64 rcv_data_fin_seq;
int wmem_reserved;
int rmem_fwd_alloc;
struct sock *last_snd;
int snd_burst;
int old_wspace;
@ -272,19 +272,6 @@ struct mptcp_sock {
char ca_name[TCP_CA_NAME_MAX];
};
#define mptcp_lock_sock(___sk, cb) do { \
struct sock *__sk = (___sk); /* silence macro reuse warning */ \
might_sleep(); \
spin_lock_bh(&__sk->sk_lock.slock); \
if (__sk->sk_lock.owned) \
__lock_sock(__sk); \
cb; \
__sk->sk_lock.owned = 1; \
spin_unlock(&__sk->sk_lock.slock); \
mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \
local_bh_enable(); \
} while (0)
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)