mirror of
https://github.com/torvalds/linux.git
synced 2024-11-28 23:21:31 +00:00
mptcp: rework mptcp_sendmsg_frag to accept optional dfrag
This will simplify mptcp-level retransmission implementation in the next patch. If dfrag is provided by the caller, skip kernel space memory allocation and use data and metadata provided by the dfrag itself. Because a peer could ack data at TCP level but refrain from sending mptcp-level ACKs, we could grow the mptcp socket backlog indefinitely. We should thus block mptcp_sendmsg until the peer has acked some of the sent data. In order to be able to do so, increment the mptcp socket wmem_queued counter on memory allocation and decrement it when releasing the memory on mptcp-level ack reception. Because TCP performns sndbuf auto-tuning up to tcp_wmem_max[2], make this the mptcp sk_sndbuf limit. In the future we could add experiment with autotuning as TCP does in tcp_sndbuf_expand(). v2 -> v3: - remove 'inline' in foo.c files (David S. Miller) Co-developed-by: Florian Westphal <fw@strlen.de> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
7948f6cc99
commit
3f8e0aae17
@ -316,15 +316,15 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
|
||||
const struct sk_buff *skb,
|
||||
const struct mptcp_ext *mpext)
|
||||
static bool mptcp_skb_can_collapse_to(u64 write_seq,
|
||||
const struct sk_buff *skb,
|
||||
const struct mptcp_ext *mpext)
|
||||
{
|
||||
if (!tcp_skb_can_collapse_to(skb))
|
||||
return false;
|
||||
|
||||
/* can collapse only if MPTCP level sequence is in order */
|
||||
return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
|
||||
return mpext && mpext->data_seq + mpext->data_len == write_seq;
|
||||
}
|
||||
|
||||
static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
|
||||
@ -417,23 +417,28 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
|
||||
}
|
||||
|
||||
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
|
||||
struct msghdr *msg, long *timeo, int *pmss_now,
|
||||
struct msghdr *msg, struct mptcp_data_frag *dfrag,
|
||||
long *timeo, int *pmss_now,
|
||||
int *ps_goal)
|
||||
{
|
||||
int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
|
||||
bool dfrag_collapsed, can_collapse = false;
|
||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
||||
struct mptcp_ext *mpext = NULL;
|
||||
struct mptcp_data_frag *dfrag;
|
||||
bool retransmission = !!dfrag;
|
||||
struct sk_buff *skb, *tail;
|
||||
struct page_frag *pfrag;
|
||||
struct page *page;
|
||||
u64 *write_seq;
|
||||
size_t psize;
|
||||
|
||||
/* use the mptcp page cache so that we can easily move the data
|
||||
* from one substream to another, but do per subflow memory accounting
|
||||
* Note: pfrag is used only !retransmission, but the compiler if
|
||||
* fooled into a warning if we don't init here
|
||||
*/
|
||||
pfrag = sk_page_frag(sk);
|
||||
while (!mptcp_page_frag_refill(ssk, pfrag) ||
|
||||
while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
|
||||
!mptcp_ext_cache_refill(msk)) {
|
||||
ret = sk_stream_wait_memory(ssk, timeo);
|
||||
if (ret)
|
||||
@ -447,6 +452,13 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
|
||||
if (unlikely(__mptcp_needs_tcp_fallback(msk)))
|
||||
return 0;
|
||||
}
|
||||
if (!retransmission) {
|
||||
write_seq = &msk->write_seq;
|
||||
page = pfrag->page;
|
||||
} else {
|
||||
write_seq = &dfrag->data_seq;
|
||||
page = dfrag->page;
|
||||
}
|
||||
|
||||
/* compute copy limit */
|
||||
mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
|
||||
@ -464,64 +476,75 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
|
||||
* SSN association set here
|
||||
*/
|
||||
can_collapse = (size_goal - skb->len > 0) &&
|
||||
mptcp_skb_can_collapse_to(msk, skb, mpext);
|
||||
mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
|
||||
if (!can_collapse)
|
||||
TCP_SKB_CB(skb)->eor = 1;
|
||||
else
|
||||
avail_size = size_goal - skb->len;
|
||||
}
|
||||
|
||||
/* reuse tail pfrag, if possible, or carve a new one from the page
|
||||
* allocator
|
||||
*/
|
||||
dfrag = mptcp_rtx_tail(sk);
|
||||
offset = pfrag->offset;
|
||||
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
|
||||
if (!dfrag_collapsed) {
|
||||
dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
|
||||
if (!retransmission) {
|
||||
/* reuse tail pfrag, if possible, or carve a new one from the
|
||||
* page allocator
|
||||
*/
|
||||
dfrag = mptcp_rtx_tail(sk);
|
||||
offset = pfrag->offset;
|
||||
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
|
||||
if (!dfrag_collapsed) {
|
||||
dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
|
||||
offset = dfrag->offset;
|
||||
frag_truesize = dfrag->overhead;
|
||||
}
|
||||
psize = min_t(size_t, pfrag->size - offset, avail_size);
|
||||
|
||||
/* Copy to page */
|
||||
pr_debug("left=%zu", msg_data_left(msg));
|
||||
psize = copy_page_from_iter(pfrag->page, offset,
|
||||
min_t(size_t, msg_data_left(msg),
|
||||
psize),
|
||||
&msg->msg_iter);
|
||||
pr_debug("left=%zu", msg_data_left(msg));
|
||||
if (!psize)
|
||||
return -EINVAL;
|
||||
|
||||
if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
|
||||
return -ENOMEM;
|
||||
} else {
|
||||
offset = dfrag->offset;
|
||||
frag_truesize = dfrag->overhead;
|
||||
psize = min_t(size_t, dfrag->data_len, avail_size);
|
||||
}
|
||||
psize = min_t(size_t, pfrag->size - offset, avail_size);
|
||||
|
||||
/* Copy to page */
|
||||
pr_debug("left=%zu", msg_data_left(msg));
|
||||
psize = copy_page_from_iter(pfrag->page, offset,
|
||||
min_t(size_t, msg_data_left(msg), psize),
|
||||
&msg->msg_iter);
|
||||
pr_debug("left=%zu", msg_data_left(msg));
|
||||
if (!psize)
|
||||
return -EINVAL;
|
||||
|
||||
if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
|
||||
return -ENOMEM;
|
||||
|
||||
/* tell the TCP stack to delay the push so that we can safely
|
||||
* access the skb after the sendpages call
|
||||
*/
|
||||
ret = do_tcp_sendpages(ssk, pfrag->page, offset, psize,
|
||||
ret = do_tcp_sendpages(ssk, page, offset, psize,
|
||||
msg->msg_flags | MSG_SENDPAGE_NOTLAST);
|
||||
if (ret <= 0)
|
||||
return ret;
|
||||
|
||||
frag_truesize += ret;
|
||||
if (unlikely(ret < psize))
|
||||
iov_iter_revert(&msg->msg_iter, psize - ret);
|
||||
if (!retransmission) {
|
||||
if (unlikely(ret < psize))
|
||||
iov_iter_revert(&msg->msg_iter, psize - ret);
|
||||
|
||||
/* send successful, keep track of sent data for mptcp-level
|
||||
* retransmission
|
||||
*/
|
||||
dfrag->data_len += ret;
|
||||
if (!dfrag_collapsed) {
|
||||
get_page(dfrag->page);
|
||||
list_add_tail(&dfrag->list, &msk->rtx_queue);
|
||||
/* send successful, keep track of sent data for mptcp-level
|
||||
* retransmission
|
||||
*/
|
||||
dfrag->data_len += ret;
|
||||
if (!dfrag_collapsed) {
|
||||
get_page(dfrag->page);
|
||||
list_add_tail(&dfrag->list, &msk->rtx_queue);
|
||||
sk_wmem_queued_add(sk, frag_truesize);
|
||||
} else {
|
||||
sk_wmem_queued_add(sk, ret);
|
||||
}
|
||||
|
||||
/* charge data on mptcp rtx queue to the master socket
|
||||
* Note: we charge such data both to sk and ssk
|
||||
*/
|
||||
sk->sk_forward_alloc -= frag_truesize;
|
||||
}
|
||||
|
||||
/* charge data on mptcp rtx queue to the master socket
|
||||
* Note: we charge such data both to sk and ssk
|
||||
*/
|
||||
sk->sk_forward_alloc -= frag_truesize;
|
||||
|
||||
/* if the tail skb extension is still the cached one, collapsing
|
||||
* really happened. Note: we can't check for 'same skb' as the sk_buff
|
||||
* hdr on tail can be transmitted, freed and re-allocated by the
|
||||
@ -539,7 +562,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
|
||||
msk->cached_ext = NULL;
|
||||
|
||||
memset(mpext, 0, sizeof(*mpext));
|
||||
mpext->data_seq = msk->write_seq;
|
||||
mpext->data_seq = *write_seq;
|
||||
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
|
||||
mpext->data_len = ret;
|
||||
mpext->use_map = 1;
|
||||
@ -550,8 +573,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
|
||||
mpext->dsn64);
|
||||
|
||||
out:
|
||||
pfrag->offset += frag_truesize;
|
||||
msk->write_seq += ret;
|
||||
if (!retransmission)
|
||||
pfrag->offset += frag_truesize;
|
||||
*write_seq += ret;
|
||||
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
|
||||
|
||||
return ret;
|
||||
@ -663,7 +687,7 @@ fallback:
|
||||
|
||||
lock_sock(ssk);
|
||||
while (msg_data_left(msg)) {
|
||||
ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
|
||||
ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
|
||||
&size_goal);
|
||||
if (ret < 0)
|
||||
break;
|
||||
@ -974,6 +998,7 @@ static int mptcp_init_sock(struct sock *sk)
|
||||
return ret;
|
||||
|
||||
sk_sockets_allocated_inc(sk);
|
||||
sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
|
||||
|
||||
if (!mptcp_is_enabled(sock_net(sk)))
|
||||
return -ENOPROTOOPT;
|
||||
|
Loading…
Reference in New Issue
Block a user