2019-05-19 12:08:55 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2014-02-06 00:11:33 +00:00
|
|
|
#include <linux/export.h>
|
2016-11-01 13:40:13 +00:00
|
|
|
#include <linux/bvec.h>
|
2020-10-16 03:13:50 +00:00
|
|
|
#include <linux/fault-inject-usercopy.h>
|
2014-02-06 00:11:33 +00:00
|
|
|
#include <linux/uio.h>
|
|
|
|
#include <linux/pagemap.h>
|
2021-05-05 01:40:03 +00:00
|
|
|
#include <linux/highmem.h>
|
2014-03-21 08:58:33 +00:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/vmalloc.h>
|
2016-09-22 20:33:12 +00:00
|
|
|
#include <linux/splice.h>
|
2020-09-25 04:51:40 +00:00
|
|
|
#include <linux/compat.h>
|
2018-12-04 01:52:09 +00:00
|
|
|
#include <linux/scatterlist.h>
|
2020-01-21 16:05:11 +00:00
|
|
|
#include <linux/instrumented.h>
|
2023-09-25 12:03:04 +00:00
|
|
|
#include <linux/iov_iter.h>
|
2014-02-06 00:11:33 +00:00
|
|
|
|
2023-09-25 12:03:04 +00:00
|
|
|
static __always_inline
|
|
|
|
size_t copy_to_user_iter(void __user *iter_to, size_t progress,
|
|
|
|
size_t len, void *from, void *priv2)
|
2017-06-30 02:25:14 +00:00
|
|
|
{
|
2020-10-16 03:13:50 +00:00
|
|
|
if (should_fail_usercopy())
|
2023-09-25 12:03:04 +00:00
|
|
|
return len;
|
|
|
|
if (access_ok(iter_to, len)) {
|
|
|
|
from += progress;
|
|
|
|
instrument_copy_to_user(iter_to, from, len);
|
|
|
|
len = raw_copy_to_user(iter_to, from, len);
|
2017-06-30 02:25:14 +00:00
|
|
|
}
|
2023-09-25 12:03:04 +00:00
|
|
|
return len;
|
2017-06-30 02:25:14 +00:00
|
|
|
}
|
|
|
|
|
2023-09-25 12:03:04 +00:00
|
|
|
static __always_inline
|
|
|
|
size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
|
|
|
|
size_t len, void *from, void *priv2)
|
2023-03-22 18:57:03 +00:00
|
|
|
{
|
2023-09-25 12:03:04 +00:00
|
|
|
ssize_t res;
|
2023-03-22 18:57:03 +00:00
|
|
|
|
|
|
|
if (should_fail_usercopy())
|
2023-09-25 12:03:04 +00:00
|
|
|
return len;
|
2023-03-22 18:57:03 +00:00
|
|
|
|
2023-09-25 12:03:04 +00:00
|
|
|
from += progress;
|
|
|
|
res = copy_to_user_nofault(iter_to, from, len);
|
|
|
|
return res < 0 ? len : res;
|
2023-03-22 18:57:03 +00:00
|
|
|
}
|
|
|
|
|
2023-09-25 12:03:04 +00:00
|
|
|
static __always_inline
|
|
|
|
size_t copy_from_user_iter(void __user *iter_from, size_t progress,
|
|
|
|
size_t len, void *to, void *priv2)
|
2017-06-30 02:25:14 +00:00
|
|
|
{
|
2023-09-25 12:03:04 +00:00
|
|
|
size_t res = len;
|
2022-09-15 15:03:37 +00:00
|
|
|
|
2020-10-16 03:13:50 +00:00
|
|
|
if (should_fail_usercopy())
|
2023-09-25 12:03:04 +00:00
|
|
|
return len;
|
|
|
|
if (access_ok(iter_from, len)) {
|
|
|
|
to += progress;
|
|
|
|
instrument_copy_from_user_before(to, iter_from, len);
|
|
|
|
res = raw_copy_from_user(to, iter_from, len);
|
|
|
|
instrument_copy_from_user_after(to, iter_from, len, res);
|
2017-06-30 02:25:14 +00:00
|
|
|
}
|
2022-09-15 15:03:37 +00:00
|
|
|
return res;
|
2017-06-30 02:25:14 +00:00
|
|
|
}
|
|
|
|
|
2023-09-25 12:03:04 +00:00
|
|
|
/*
 * Copy @len bytes from @from + @progress to the kernel address @iter_to
 * with a plain memcpy().
 *
 * Always returns 0. @priv2 is not used.
 */
static __always_inline
size_t memcpy_to_iter(void *iter_to, size_t progress,
		      size_t len, void *from, void *priv2)
{
	const void *src = from + progress;

	memcpy(iter_to, src, len);

	return 0;
}
|
|
|
|
|
|
|
|
/*
 * Copy @len bytes from the kernel address @iter_from to @to + @progress
 * with a plain memcpy().
 *
 * Always returns 0. @priv2 is not used.
 */
static __always_inline
size_t memcpy_from_iter(void *iter_from, size_t progress,
			size_t len, void *to, void *priv2)
{
	void *dst = to + progress;

	memcpy(dst, iter_from, len);

	return 0;
}
|
|
|
|
|
2015-03-11 14:43:31 +00:00
|
|
|
/*
|
2021-08-02 12:54:16 +00:00
|
|
|
* fault_in_iov_iter_readable - fault in iov iterator for reading
|
|
|
|
* @i: iterator
|
|
|
|
* @size: maximum length
|
|
|
|
*
|
2015-03-11 14:43:31 +00:00
|
|
|
* Fault in one or more iovecs of the given iov_iter, to a maximum length of
|
2021-08-02 12:54:16 +00:00
|
|
|
* @size. For each iovec, fault in each page that constitutes the iovec.
|
|
|
|
*
|
|
|
|
* Returns the number of bytes not faulted in (like copy_to_user() and
|
|
|
|
* copy_from_user()).
|
2015-03-11 14:43:31 +00:00
|
|
|
*
|
2021-08-02 12:54:16 +00:00
|
|
|
* Always returns 0 for non-userspace iterators.
|
2015-03-11 14:43:31 +00:00
|
|
|
*/
|
2021-08-02 12:54:16 +00:00
|
|
|
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
|
2015-03-11 14:43:31 +00:00
|
|
|
{
|
2022-05-22 18:59:25 +00:00
|
|
|
if (iter_is_ubuf(i)) {
|
|
|
|
size_t n = min(size, iov_iter_count(i));
|
|
|
|
n -= fault_in_readable(i->ubuf + i->iov_offset, n);
|
|
|
|
return size - n;
|
|
|
|
} else if (iter_is_iovec(i)) {
|
2021-08-02 12:54:16 +00:00
|
|
|
size_t count = min(size, iov_iter_count(i));
|
2021-05-02 15:57:37 +00:00
|
|
|
const struct iovec *p;
|
|
|
|
size_t skip;
|
|
|
|
|
2021-08-02 12:54:16 +00:00
|
|
|
size -= count;
|
2023-03-29 14:52:15 +00:00
|
|
|
for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
|
2021-08-02 12:54:16 +00:00
|
|
|
size_t len = min(count, p->iov_len - skip);
|
|
|
|
size_t ret;
|
2021-05-02 15:57:37 +00:00
|
|
|
|
|
|
|
if (unlikely(!len))
|
|
|
|
continue;
|
2021-08-02 12:54:16 +00:00
|
|
|
ret = fault_in_readable(p->iov_base + skip, len);
|
|
|
|
count -= len - ret;
|
|
|
|
if (ret)
|
|
|
|
break;
|
2021-05-02 15:57:37 +00:00
|
|
|
}
|
2021-08-02 12:54:16 +00:00
|
|
|
return count + size;
|
2015-03-11 14:43:31 +00:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2021-08-02 12:54:16 +00:00
|
|
|
EXPORT_SYMBOL(fault_in_iov_iter_readable);
|
2015-03-11 14:43:31 +00:00
|
|
|
|
2021-07-05 15:26:28 +00:00
|
|
|
/*
|
|
|
|
* fault_in_iov_iter_writeable - fault in iov iterator for writing
|
|
|
|
* @i: iterator
|
|
|
|
* @size: maximum length
|
|
|
|
*
|
|
|
|
* Faults in the iterator using get_user_pages(), i.e., without triggering
|
|
|
|
* hardware page faults. This is primarily useful when we already know that
|
|
|
|
* some or all of the pages in @i aren't in memory.
|
|
|
|
*
|
|
|
|
* Returns the number of bytes not faulted in, like copy_to_user() and
|
|
|
|
* copy_from_user().
|
|
|
|
*
|
|
|
|
* Always returns 0 for non-user-space iterators.
|
|
|
|
*/
|
|
|
|
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
|
|
|
|
{
|
2022-05-22 18:59:25 +00:00
|
|
|
if (iter_is_ubuf(i)) {
|
|
|
|
size_t n = min(size, iov_iter_count(i));
|
|
|
|
n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
|
|
|
|
return size - n;
|
|
|
|
} else if (iter_is_iovec(i)) {
|
2021-07-05 15:26:28 +00:00
|
|
|
size_t count = min(size, iov_iter_count(i));
|
|
|
|
const struct iovec *p;
|
|
|
|
size_t skip;
|
|
|
|
|
|
|
|
size -= count;
|
2023-03-29 14:52:15 +00:00
|
|
|
for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
|
2021-07-05 15:26:28 +00:00
|
|
|
size_t len = min(count, p->iov_len - skip);
|
|
|
|
size_t ret;
|
|
|
|
|
|
|
|
if (unlikely(!len))
|
|
|
|
continue;
|
|
|
|
ret = fault_in_safe_writeable(p->iov_base + skip, len);
|
|
|
|
count -= len - ret;
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return count + size;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(fault_in_iov_iter_writeable);
|
|
|
|
|
2018-10-19 23:57:56 +00:00
|
|
|
void iov_iter_init(struct iov_iter *i, unsigned int direction,
|
2014-03-06 00:28:09 +00:00
|
|
|
const struct iovec *iov, unsigned long nr_segs,
|
|
|
|
size_t count)
|
|
|
|
{
|
2018-10-19 23:57:56 +00:00
|
|
|
WARN_ON(direction & ~(READ | WRITE));
|
2021-04-22 18:50:39 +00:00
|
|
|
*i = (struct iov_iter) {
|
|
|
|
.iter_type = ITER_IOVEC,
|
2021-07-12 10:06:14 +00:00
|
|
|
.nofault = false,
|
2021-04-22 18:50:39 +00:00
|
|
|
.data_source = direction,
|
2023-03-29 14:52:15 +00:00
|
|
|
.__iov = iov,
|
2021-04-22 18:50:39 +00:00
|
|
|
.nr_segs = nr_segs,
|
|
|
|
.iov_offset = 0,
|
|
|
|
.count = count
|
|
|
|
};
|
2014-03-06 00:28:09 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_init);
|
2014-03-15 08:05:57 +00:00
|
|
|
|
2017-06-30 01:45:10 +00:00
|
|
|
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
|
2014-04-05 03:12:29 +00:00
|
|
|
{
|
2022-09-16 00:11:15 +00:00
|
|
|
if (WARN_ON_ONCE(i->data_source))
|
|
|
|
return 0;
|
2022-05-22 18:59:25 +00:00
|
|
|
if (user_backed_iter(i))
|
2017-06-30 02:25:14 +00:00
|
|
|
might_fault();
|
2023-09-25 12:03:04 +00:00
|
|
|
return iterate_and_advance(i, bytes, (void *)addr,
|
|
|
|
copy_to_user_iter, memcpy_to_iter);
|
2014-08-01 13:27:22 +00:00
|
|
|
}
|
2017-06-30 01:45:10 +00:00
|
|
|
EXPORT_SYMBOL(_copy_to_iter);
|
2014-08-01 13:27:22 +00:00
|
|
|
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 03:40:16 +00:00
|
|
|
#ifdef CONFIG_ARCH_HAS_COPY_MC
|
2023-09-25 12:03:04 +00:00
|
|
|
static __always_inline
|
|
|
|
size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
|
|
|
|
size_t len, void *from, void *priv2)
|
2018-05-04 00:06:31 +00:00
|
|
|
{
|
2023-09-25 12:03:04 +00:00
|
|
|
if (access_ok(iter_to, len)) {
|
|
|
|
from += progress;
|
|
|
|
instrument_copy_to_user(iter_to, from, len);
|
|
|
|
len = copy_mc_to_user(iter_to, from, len);
|
2018-05-04 00:06:31 +00:00
|
|
|
}
|
2023-09-25 12:03:04 +00:00
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline
|
|
|
|
size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
|
|
|
|
size_t len, void *from, void *priv2)
|
|
|
|
{
|
|
|
|
return copy_mc_to_kernel(iter_to, from + progress, len);
|
2018-05-04 00:06:31 +00:00
|
|
|
}
|
|
|
|
|
2018-07-08 20:46:02 +00:00
|
|
|
/**
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 03:40:16 +00:00
|
|
|
* _copy_mc_to_iter - copy to iter with source memory error exception handling
|
2018-07-08 20:46:02 +00:00
|
|
|
* @addr: source kernel address
|
|
|
|
* @bytes: total transfer length
|
2021-09-08 02:58:54 +00:00
|
|
|
* @i: destination iterator
|
2018-07-08 20:46:02 +00:00
|
|
|
*
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 03:40:16 +00:00
|
|
|
* The pmem driver deploys this for the dax operation
|
|
|
|
* (dax_copy_to_iter()) for dax reads (bypass page-cache and the
|
|
|
|
* block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
|
|
|
|
* successfully copied.
|
2018-07-08 20:46:02 +00:00
|
|
|
*
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 03:40:16 +00:00
|
|
|
* The main differences between this and typical _copy_to_iter().
|
2018-07-08 20:46:02 +00:00
|
|
|
*
|
|
|
|
* * Typical tail/residue handling after a fault retries the copy
|
|
|
|
* byte-by-byte until the fault happens again. Re-triggering machine
|
|
|
|
* checks is potentially fatal so the implementation uses source
|
|
|
|
* alignment and poison alignment assumptions to avoid re-triggering
|
|
|
|
* hardware exceptions.
|
|
|
|
*
|
2023-05-22 13:50:17 +00:00
|
|
|
* * ITER_KVEC and ITER_BVEC can return short copies. Compare to
|
|
|
|
* copy_to_iter() where only ITER_IOVEC attempts might return a short copy.
|
2021-09-08 02:58:54 +00:00
|
|
|
*
|
|
|
|
* Return: number of bytes copied (may be %0)
|
2018-07-08 20:46:02 +00:00
|
|
|
*/
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 03:40:16 +00:00
|
|
|
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
|
2018-05-04 00:06:31 +00:00
|
|
|
{
|
2022-09-16 00:11:15 +00:00
|
|
|
if (WARN_ON_ONCE(i->data_source))
|
|
|
|
return 0;
|
2022-05-22 18:59:25 +00:00
|
|
|
if (user_backed_iter(i))
|
2018-05-04 00:06:31 +00:00
|
|
|
might_fault();
|
2023-09-25 12:03:04 +00:00
|
|
|
return iterate_and_advance(i, bytes, (void *)addr,
|
|
|
|
copy_to_user_iter_mc, memcpy_to_iter_mc);
|
2018-05-04 00:06:31 +00:00
|
|
|
}
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 03:40:16 +00:00
|
|
|
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
|
|
|
|
#endif /* CONFIG_ARCH_HAS_COPY_MC */
|
2018-05-04 00:06:31 +00:00
|
|
|
|
2023-09-25 12:03:05 +00:00
|
|
|
static __always_inline
|
|
|
|
size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return iterate_and_advance(i, bytes, addr,
|
|
|
|
copy_from_user_iter, memcpy_from_iter);
|
mm: hwpoison: coredump: support recovery from dump_user_range()
dump_user_range() is used to copy the user page to a coredump file, but if
a hardware memory error occurred during copy, which called from
__kernel_write_iter() in dump_user_range(), it crashes,
CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
pc : __memcpy+0x110/0x260
lr : _copy_from_iter+0x3bc/0x4c8
...
Call trace:
__memcpy+0x110/0x260
copy_page_from_iter+0xcc/0x130
pipe_write+0x164/0x6d8
__kernel_write_iter+0x9c/0x210
dump_user_range+0xc8/0x1d8
elf_core_dump+0x308/0x368
do_coredump+0x2e8/0xa40
get_signal+0x59c/0x788
do_signal+0x118/0x1f8
do_notify_resume+0xf0/0x280
el0_da+0x130/0x138
el0t_64_sync_handler+0x68/0xc0
el0t_64_sync+0x188/0x190
Generally, the '->write_iter' of file ops will use copy_page_from_iter()
and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
in both of them to handle #MC during source read, which stop coredump
processing and kill the task instead of kernel panic, but the source
address may not always a user address, so introduce a new copy_mc flag in
struct iov_iter{} to indicate that the iter could do a safe memory copy,
also introduce the helpers to set/cleck the flag, for now, it's only used
in coredump's dump_user_range(), but it could expand to any other
scenarios to fix the similar issue.
Link: https://lkml.kernel.org/r/20230417045323.11054-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Tong Tiangen <tongtiangen@huawei.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-17 04:53:23 +00:00
|
|
|
}
|
|
|
|
|
2017-06-30 01:45:10 +00:00
|
|
|
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
|
2014-08-01 13:27:22 +00:00
|
|
|
{
|
2022-09-16 00:11:15 +00:00
|
|
|
if (WARN_ON_ONCE(!i->data_source))
|
2016-09-22 20:33:12 +00:00
|
|
|
return 0;
|
2022-09-16 00:11:15 +00:00
|
|
|
|
2022-05-22 18:59:25 +00:00
|
|
|
if (user_backed_iter(i))
|
2017-06-30 02:25:14 +00:00
|
|
|
might_fault();
|
2023-09-25 12:03:05 +00:00
|
|
|
return __copy_from_iter(addr, bytes, i);
|
2014-08-01 13:27:22 +00:00
|
|
|
}
|
2017-06-30 01:45:10 +00:00
|
|
|
EXPORT_SYMBOL(_copy_from_iter);
|
2014-08-01 13:27:22 +00:00
|
|
|
|
2023-09-25 12:03:04 +00:00
|
|
|
static __always_inline
|
|
|
|
size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
|
|
|
|
size_t len, void *to, void *priv2)
|
|
|
|
{
|
|
|
|
return __copy_from_user_inatomic_nocache(to + progress, iter_from, len);
|
|
|
|
}
|
|
|
|
|
2017-06-30 01:45:10 +00:00
|
|
|
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
|
2014-11-28 01:27:08 +00:00
|
|
|
{
|
2022-09-16 00:11:15 +00:00
|
|
|
if (WARN_ON_ONCE(!i->data_source))
|
2016-09-22 20:33:12 +00:00
|
|
|
return 0;
|
2022-09-16 00:11:15 +00:00
|
|
|
|
2023-09-25 12:03:04 +00:00
|
|
|
return iterate_and_advance(i, bytes, addr,
|
|
|
|
copy_from_user_iter_nocache,
|
|
|
|
memcpy_from_iter);
|
2014-11-28 01:27:08 +00:00
|
|
|
}
|
2017-06-30 01:45:10 +00:00
|
|
|
EXPORT_SYMBOL(_copy_from_iter_nocache);
|
2014-11-28 01:27:08 +00:00
|
|
|
|
2017-05-29 19:22:50 +00:00
|
|
|
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
|
2023-09-25 12:03:04 +00:00
|
|
|
static __always_inline
|
|
|
|
size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
|
|
|
|
size_t len, void *to, void *priv2)
|
|
|
|
{
|
|
|
|
return __copy_from_user_flushcache(to + progress, iter_from, len);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline
|
|
|
|
size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
|
|
|
|
size_t len, void *to, void *priv2)
|
|
|
|
{
|
|
|
|
memcpy_flushcache(to + progress, iter_from, len);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-08 20:46:07 +00:00
|
|
|
/**
|
|
|
|
* _copy_from_iter_flushcache - write destination through cpu cache
|
|
|
|
* @addr: destination kernel address
|
|
|
|
* @bytes: total transfer length
|
2021-09-08 02:58:54 +00:00
|
|
|
* @i: source iterator
|
2018-07-08 20:46:07 +00:00
|
|
|
*
|
|
|
|
* The pmem driver arranges for filesystem-dax to use this facility via
|
|
|
|
* dax_copy_from_iter() for ensuring that writes to persistent memory
|
|
|
|
* are flushed through the CPU cache. It is differentiated from
|
|
|
|
* _copy_from_iter_nocache() in that it guarantees all data is flushed for
|
|
|
|
* all iterator types. The _copy_from_iter_nocache() only attempts to
|
|
|
|
* bypass the cache for the ITER_IOVEC case, and on some archs may use
|
|
|
|
* instructions that strand dirty-data in the cache.
|
2021-09-08 02:58:54 +00:00
|
|
|
*
|
|
|
|
* Return: number of bytes copied (may be %0)
|
2018-07-08 20:46:07 +00:00
|
|
|
*/
|
Merge branch 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull iov_iter hardening from Al Viro:
"This is the iov_iter/uaccess/hardening pile.
For one thing, it trims the inline part of copy_to_user/copy_from_user
to the minimum that *does* need to be inlined - object size checks,
basically. For another, it sanitizes the checks for iov_iter
primitives. There are 4 groups of checks: access_ok(), might_fault(),
object size and KASAN.
- access_ok() had been verified by whoever had set the iov_iter up.
However, that has happened in a function far away, so proving that
there's no path to actual copying bypassing those checks is hard
and proving that iov_iter has not been buggered in the meanwhile is
also not pleasant. So we want those redone in actual
copyin/copyout.
- might_fault() is better off consolidated - we know whether it needs
to be checked as soon as we enter iov_iter primitive and observe
the iov_iter flavour. No need to wait until the copyin/copyout. The
call chains are short enough to make sure we won't miss anything -
in fact, it's more robust that way, since there are cases where we
do e.g. forced fault-in before getting to copyin/copyout. It's not
quite what we need to check (in particular, combination of
iovec-backed and set_fs(KERNEL_DS) is almost certainly a bug, not a
cause to skip checks), but that's for later series. For now let's
keep might_fault().
- KASAN checks belong in copyin/copyout - at the same level where
other iov_iter flavours would've hit them in memcpy().
- object size checks should apply to *all* iov_iter flavours, not
just iovec-backed ones.
There are two groups of primitives - one gets the kernel object
described as pointer + size (copy_to_iter(), etc.) while another gets
it as page + offset + size (copy_page_to_iter(), etc.)
For the first group the checks are best done where we actually have a
chance to find the object size. In other words, those belong in inline
wrappers in uio.h, before calling into iov_iter.c. Same kind as we
have for inlined part of copy_to_user().
For the second group there is no object to look at - offset in page is
just a number, it bears no type information. So we do them in the
common helper called by iov_iter.c primitives of that kind. All it
currently does is checking that we are not trying to access outside of
the compound page; eventually we might want to add some sanity checks
on the page involved.
So the things we need in copyin/copyout part of iov_iter.c do not
quite match anything in uaccess.h (we want no zeroing, we *do* want
access_ok() and KASAN and we want no might_fault() or object size
checks done on that level). OTOH, these needs are simple enough to
provide a couple of helpers (static in iov_iter.c) doing just what we
need..."
* 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
iov_iter: saner checks on copyin/copyout
iov_iter: sanity checks for copy to/from page primitives
iov_iter/hardening: move object size checks to inlined part
copy_{to,from}_user(): consolidate object size checks
copy_{from,to}_user(): move kasan checks and might_fault() out-of-line
2017-07-08 03:39:20 +00:00
|
|
|
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
|
2017-05-29 19:22:50 +00:00
|
|
|
{
|
2022-09-16 00:11:15 +00:00
|
|
|
if (WARN_ON_ONCE(!i->data_source))
|
2017-05-29 19:22:50 +00:00
|
|
|
return 0;
|
2022-09-16 00:11:15 +00:00
|
|
|
|
2023-09-25 12:03:04 +00:00
|
|
|
return iterate_and_advance(i, bytes, addr,
|
|
|
|
copy_from_user_iter_flushcache,
|
|
|
|
memcpy_from_iter_flushcache);
|
2017-05-29 19:22:50 +00:00
|
|
|
}
|
Merge branch 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull iov_iter hardening from Al Viro:
"This is the iov_iter/uaccess/hardening pile.
For one thing, it trims the inline part of copy_to_user/copy_from_user
to the minimum that *does* need to be inlined - object size checks,
basically. For another, it sanitizes the checks for iov_iter
primitives. There are 4 groups of checks: access_ok(), might_fault(),
object size and KASAN.
- access_ok() had been verified by whoever had set the iov_iter up.
However, that has happened in a function far away, so proving that
there's no path to actual copying bypassing those checks is hard
and proving that iov_iter has not been buggered in the meanwhile is
also not pleasant. So we want those redone in actual
copyin/copyout.
- might_fault() is better off consolidated - we know whether it needs
to be checked as soon as we enter iov_iter primitive and observe
the iov_iter flavour. No need to wait until the copyin/copyout. The
call chains are short enough to make sure we won't miss anything -
in fact, it's more robust that way, since there are cases where we
do e.g. forced fault-in before getting to copyin/copyout. It's not
quite what we need to check (in particular, combination of
iovec-backed and set_fs(KERNEL_DS) is almost certainly a bug, not a
cause to skip checks), but that's for later series. For now let's
keep might_fault().
- KASAN checks belong in copyin/copyout - at the same level where
other iov_iter flavours would've hit them in memcpy().
- object size checks should apply to *all* iov_iter flavours, not
just iovec-backed ones.
There are two groups of primitives - one gets the kernel object
described as pointer + size (copy_to_iter(), etc.) while another gets
it as page + offset + size (copy_page_to_iter(), etc.)
For the first group the checks are best done where we actually have a
chance to find the object size. In other words, those belong in inline
wrappers in uio.h, before calling into iov_iter.c. Same kind as we
have for inlined part of copy_to_user().
For the second group there is no object to look at - offset in page is
just a number, it bears no type information. So we do them in the
common helper called by iov_iter.c primitives of that kind. All it
currently does is checking that we are not trying to access outside of
the compound page; eventually we might want to add some sanity checks
on the page involved.
So the things we need in copyin/copyout part of iov_iter.c do not
quite match anything in uaccess.h (we want no zeroing, we *do* want
access_ok() and KASAN and we want no might_fault() or object size
checks done on that level). OTOH, these needs are simple enough to
provide a couple of helpers (static in iov_iter.c) doing just what we
need..."
* 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
iov_iter: saner checks on copyin/copyout
iov_iter: sanity checks for copy to/from page primitives
iov_iter/hardening: move object size checks to inlined part
copy_{to,from}_user(): consolidate object size checks
copy_{from,to}_user(): move kasan checks and might_fault() out-of-line
2017-07-08 03:39:20 +00:00
|
|
|
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
|
2017-05-29 19:22:50 +00:00
|
|
|
#endif
|
|
|
|
|
2017-06-30 01:52:57 +00:00
|
|
|
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
|
|
|
|
{
|
2019-02-26 18:42:39 +00:00
|
|
|
struct page *head;
|
|
|
|
size_t v = n + offset;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The general case needs to access the page order in order
|
|
|
|
* to compute the page size.
|
|
|
|
* However, we mostly deal with order-0 pages and thus can
|
|
|
|
* avoid a possible cache line miss for requests that fit all
|
|
|
|
* page orders.
|
|
|
|
*/
|
|
|
|
if (n <= v && v <= PAGE_SIZE)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
head = compound_head(page);
|
|
|
|
v += (page - head) << PAGE_SHIFT;
|
2017-08-29 18:20:32 +00:00
|
|
|
|
2022-07-29 17:01:57 +00:00
|
|
|
if (WARN_ON(n > v || v > page_size(head)))
|
|
|
|
return false;
|
|
|
|
return true;
|
2017-06-30 01:52:57 +00:00
|
|
|
}
|
2016-11-02 02:09:04 +00:00
|
|
|
|
2021-04-30 00:42:25 +00:00
|
|
|
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
|
|
|
|
struct iov_iter *i)
|
|
|
|
{
|
|
|
|
size_t res = 0;
|
2022-07-29 17:01:57 +00:00
|
|
|
if (!page_copy_sane(page, offset, bytes))
|
2021-04-30 00:42:25 +00:00
|
|
|
return 0;
|
2022-09-16 00:11:15 +00:00
|
|
|
if (WARN_ON_ONCE(i->data_source))
|
|
|
|
return 0;
|
2021-04-30 00:42:25 +00:00
|
|
|
page += offset / PAGE_SIZE; // first subpage
|
|
|
|
offset %= PAGE_SIZE;
|
|
|
|
while (1) {
|
2022-06-23 21:21:37 +00:00
|
|
|
void *kaddr = kmap_local_page(page);
|
|
|
|
size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
|
|
|
|
n = _copy_to_iter(kaddr + offset, n, i);
|
|
|
|
kunmap_local(kaddr);
|
2021-04-30 00:42:25 +00:00
|
|
|
res += n;
|
|
|
|
bytes -= n;
|
|
|
|
if (!bytes || !n)
|
|
|
|
break;
|
|
|
|
offset += n;
|
|
|
|
if (offset == PAGE_SIZE) {
|
|
|
|
page++;
|
|
|
|
offset = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
2014-04-05 03:12:29 +00:00
|
|
|
EXPORT_SYMBOL(copy_page_to_iter);
|
|
|
|
|
2023-03-22 18:57:03 +00:00
|
|
|
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
|
|
|
|
struct iov_iter *i)
|
|
|
|
{
|
|
|
|
size_t res = 0;
|
|
|
|
|
|
|
|
if (!page_copy_sane(page, offset, bytes))
|
|
|
|
return 0;
|
|
|
|
if (WARN_ON_ONCE(i->data_source))
|
|
|
|
return 0;
|
|
|
|
page += offset / PAGE_SIZE; // first subpage
|
|
|
|
offset %= PAGE_SIZE;
|
|
|
|
while (1) {
|
|
|
|
void *kaddr = kmap_local_page(page);
|
|
|
|
size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
|
|
|
|
|
2023-11-17 21:38:46 +00:00
|
|
|
n = iterate_and_advance(i, n, kaddr + offset,
|
2023-09-25 12:03:04 +00:00
|
|
|
copy_to_user_iter_nofault,
|
|
|
|
memcpy_to_iter);
|
2023-03-22 18:57:03 +00:00
|
|
|
kunmap_local(kaddr);
|
|
|
|
res += n;
|
|
|
|
bytes -= n;
|
|
|
|
if (!bytes || !n)
|
|
|
|
break;
|
|
|
|
offset += n;
|
|
|
|
if (offset == PAGE_SIZE) {
|
|
|
|
page++;
|
|
|
|
offset = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(copy_page_to_iter_nofault);
|
|
|
|
|
2014-04-05 03:12:29 +00:00
|
|
|
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
|
|
|
|
struct iov_iter *i)
|
|
|
|
{
|
2022-07-29 16:54:53 +00:00
|
|
|
size_t res = 0;
|
|
|
|
if (!page_copy_sane(page, offset, bytes))
|
|
|
|
return 0;
|
|
|
|
page += offset / PAGE_SIZE; // first subpage
|
|
|
|
offset %= PAGE_SIZE;
|
|
|
|
while (1) {
|
2021-04-27 16:33:24 +00:00
|
|
|
void *kaddr = kmap_local_page(page);
|
2022-07-29 16:54:53 +00:00
|
|
|
size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
|
|
|
|
n = _copy_from_iter(kaddr + offset, n, i);
|
2021-04-27 16:33:24 +00:00
|
|
|
kunmap_local(kaddr);
|
2022-07-29 16:54:53 +00:00
|
|
|
res += n;
|
|
|
|
bytes -= n;
|
|
|
|
if (!bytes || !n)
|
|
|
|
break;
|
|
|
|
offset += n;
|
|
|
|
if (offset == PAGE_SIZE) {
|
|
|
|
page++;
|
|
|
|
offset = 0;
|
|
|
|
}
|
2021-06-02 21:25:59 +00:00
|
|
|
}
|
2022-07-29 16:54:53 +00:00
|
|
|
return res;
|
2014-04-05 03:12:29 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(copy_page_from_iter);
|
|
|
|
|
2023-09-25 12:03:04 +00:00
|
|
|
static __always_inline
|
|
|
|
size_t zero_to_user_iter(void __user *iter_to, size_t progress,
|
|
|
|
size_t len, void *priv, void *priv2)
|
2014-08-01 13:27:22 +00:00
|
|
|
{
|
2023-09-25 12:03:04 +00:00
|
|
|
return clear_user(iter_to, len);
|
|
|
|
}
|
2014-11-27 19:18:54 +00:00
|
|
|
|
2023-09-25 12:03:04 +00:00
|
|
|
/*
 * Step helper taking a kernel destination pointer: zero-fill @len bytes
 * at @iter_to.  Always returns 0.  @progress, @priv and @priv2 are unused.
 */
static __always_inline
size_t zero_to_iter(void *iter_to, size_t progress,
		    size_t len, void *priv, void *priv2)
{
	const size_t not_cleared = 0;

	memset(iter_to, 0, len);
	return not_cleared;
}
|
|
|
|
|
|
|
|
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return iterate_and_advance(i, bytes, NULL,
|
|
|
|
zero_to_user_iter, zero_to_iter);
|
2014-08-01 13:27:22 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_zero);
|
|
|
|
|
2023-07-09 22:17:33 +00:00
|
|
|
size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
|
2023-06-06 20:43:32 +00:00
|
|
|
size_t bytes, struct iov_iter *i)
|
2014-04-05 03:12:29 +00:00
|
|
|
{
|
2023-06-06 20:43:32 +00:00
|
|
|
size_t n, copied = 0;
|
2023-07-16 03:03:52 +00:00
|
|
|
|
|
|
|
if (!page_copy_sane(page, offset, bytes))
|
2017-06-30 01:52:57 +00:00
|
|
|
return 0;
|
2023-07-16 03:03:52 +00:00
|
|
|
if (WARN_ON_ONCE(!i->data_source))
|
2016-09-22 20:33:12 +00:00
|
|
|
return 0;
|
2023-07-16 03:03:52 +00:00
|
|
|
|
2023-06-06 20:43:32 +00:00
|
|
|
do {
|
|
|
|
char *p;
|
2023-07-16 03:03:52 +00:00
|
|
|
|
2023-06-06 20:43:32 +00:00
|
|
|
n = bytes - copied;
|
|
|
|
if (PageHighMem(page)) {
|
|
|
|
page += offset / PAGE_SIZE;
|
|
|
|
offset %= PAGE_SIZE;
|
|
|
|
n = min_t(size_t, n, PAGE_SIZE - offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
p = kmap_atomic(page) + offset;
|
2023-09-25 12:03:05 +00:00
|
|
|
n = __copy_from_iter(p, n, i);
|
2023-06-06 20:43:32 +00:00
|
|
|
kunmap_atomic(p);
|
|
|
|
copied += n;
|
|
|
|
offset += n;
|
|
|
|
} while (PageHighMem(page) && copied != bytes && n > 0);
|
|
|
|
|
|
|
|
return copied;
|
2014-04-05 03:12:29 +00:00
|
|
|
}
|
2021-04-30 14:26:41 +00:00
|
|
|
EXPORT_SYMBOL(copy_page_from_iter_atomic);
|
2014-04-05 03:12:29 +00:00
|
|
|
|
2021-01-09 16:03:01 +00:00
|
|
|
static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
|
|
|
|
{
|
2022-06-07 03:44:33 +00:00
|
|
|
const struct bio_vec *bvec, *end;
|
2021-01-09 16:03:01 +00:00
|
|
|
|
2022-06-07 03:44:33 +00:00
|
|
|
if (!i->count)
|
|
|
|
return;
|
|
|
|
i->count -= size;
|
|
|
|
|
|
|
|
size += i->iov_offset;
|
2021-01-09 16:03:01 +00:00
|
|
|
|
2022-06-07 03:44:33 +00:00
|
|
|
for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
|
|
|
|
if (likely(size < bvec->bv_len))
|
|
|
|
break;
|
|
|
|
size -= bvec->bv_len;
|
|
|
|
}
|
|
|
|
i->iov_offset = size;
|
|
|
|
i->nr_segs -= bvec - i->bvec;
|
|
|
|
i->bvec = bvec;
|
2021-01-09 16:03:01 +00:00
|
|
|
}
|
|
|
|
|
2021-04-23 16:58:53 +00:00
|
|
|
static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
|
|
|
|
{
|
|
|
|
const struct iovec *iov, *end;
|
|
|
|
|
|
|
|
if (!i->count)
|
|
|
|
return;
|
|
|
|
i->count -= size;
|
|
|
|
|
|
|
|
size += i->iov_offset; // from beginning of current segment
|
2023-03-29 14:52:15 +00:00
|
|
|
for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) {
|
2021-04-23 16:58:53 +00:00
|
|
|
if (likely(size < iov->iov_len))
|
|
|
|
break;
|
|
|
|
size -= iov->iov_len;
|
|
|
|
}
|
|
|
|
i->iov_offset = size;
|
2023-03-29 14:52:15 +00:00
|
|
|
i->nr_segs -= iov - iter_iov(i);
|
|
|
|
i->__iov = iov;
|
2021-04-23 16:58:53 +00:00
|
|
|
}
|
|
|
|
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we go without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
static void iov_iter_folioq_advance(struct iov_iter *i, size_t size)
|
|
|
|
{
|
|
|
|
const struct folio_queue *folioq = i->folioq;
|
|
|
|
unsigned int slot = i->folioq_slot;
|
|
|
|
|
|
|
|
if (!i->count)
|
|
|
|
return;
|
|
|
|
i->count -= size;
|
|
|
|
|
|
|
|
if (slot >= folioq_nr_slots(folioq)) {
|
|
|
|
folioq = folioq->next;
|
|
|
|
slot = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
size += i->iov_offset; /* From beginning of current segment. */
|
|
|
|
do {
|
|
|
|
size_t fsize = folioq_folio_size(folioq, slot);
|
|
|
|
|
|
|
|
if (likely(size < fsize))
|
|
|
|
break;
|
|
|
|
size -= fsize;
|
|
|
|
slot++;
|
|
|
|
if (slot >= folioq_nr_slots(folioq) && folioq->next) {
|
|
|
|
folioq = folioq->next;
|
|
|
|
slot = 0;
|
|
|
|
}
|
|
|
|
} while (size);
|
|
|
|
|
|
|
|
i->iov_offset = size;
|
|
|
|
i->folioq_slot = slot;
|
|
|
|
i->folioq = folioq;
|
|
|
|
}
|
|
|
|
|
2014-04-05 03:12:29 +00:00
|
|
|
void iov_iter_advance(struct iov_iter *i, size_t size)
|
|
|
|
{
|
2021-04-24 02:24:08 +00:00
|
|
|
if (unlikely(i->count < size))
|
|
|
|
size = i->count;
|
2022-05-22 18:59:25 +00:00
|
|
|
if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
|
|
|
|
i->iov_offset += size;
|
|
|
|
i->count -= size;
|
|
|
|
} else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
|
2021-04-23 16:58:53 +00:00
|
|
|
/* iovec and kvec have identical layouts */
|
|
|
|
iov_iter_iovec_advance(i, size);
|
|
|
|
} else if (iov_iter_is_bvec(i)) {
|
|
|
|
iov_iter_bvec_advance(i, size);
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
} else if (iov_iter_is_folioq(i)) {
|
|
|
|
iov_iter_folioq_advance(i, size);
|
2021-04-23 16:58:53 +00:00
|
|
|
} else if (iov_iter_is_discard(i)) {
|
|
|
|
i->count -= size;
|
2021-01-09 16:03:01 +00:00
|
|
|
}
|
2014-04-05 03:12:29 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_advance);
|
|
|
|
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we go without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll)
|
|
|
|
{
|
|
|
|
const struct folio_queue *folioq = i->folioq;
|
|
|
|
unsigned int slot = i->folioq_slot;
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
size_t fsize;
|
|
|
|
|
|
|
|
if (slot == 0) {
|
|
|
|
folioq = folioq->prev;
|
|
|
|
slot = folioq_nr_slots(folioq);
|
|
|
|
}
|
|
|
|
slot--;
|
|
|
|
|
|
|
|
fsize = folioq_folio_size(folioq, slot);
|
|
|
|
if (unroll <= fsize) {
|
|
|
|
i->iov_offset = fsize - unroll;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
unroll -= fsize;
|
|
|
|
}
|
|
|
|
|
|
|
|
i->folioq_slot = slot;
|
|
|
|
i->folioq = folioq;
|
|
|
|
}
|
|
|
|
|
2017-02-17 23:42:24 +00:00
|
|
|
void iov_iter_revert(struct iov_iter *i, size_t unroll)
|
|
|
|
{
|
|
|
|
if (!unroll)
|
|
|
|
return;
|
2017-05-08 17:54:47 +00:00
|
|
|
if (WARN_ON(unroll > MAX_RW_COUNT))
|
|
|
|
return;
|
2017-02-17 23:42:24 +00:00
|
|
|
i->count += unroll;
|
2018-10-19 23:57:56 +00:00
|
|
|
if (unlikely(iov_iter_is_discard(i)))
|
|
|
|
return;
|
2017-02-17 23:42:24 +00:00
|
|
|
if (unroll <= i->iov_offset) {
|
|
|
|
i->iov_offset -= unroll;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
unroll -= i->iov_offset;
|
2022-05-22 18:59:25 +00:00
|
|
|
if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
|
2020-02-10 10:00:21 +00:00
|
|
|
BUG(); /* We should never go beyond the start of the specified
|
|
|
|
* range since we might then be straying into pages that
|
|
|
|
* aren't pinned.
|
|
|
|
*/
|
|
|
|
} else if (iov_iter_is_bvec(i)) {
|
2017-02-17 23:42:24 +00:00
|
|
|
const struct bio_vec *bvec = i->bvec;
|
|
|
|
while (1) {
|
|
|
|
size_t n = (--bvec)->bv_len;
|
|
|
|
i->nr_segs++;
|
|
|
|
if (unroll <= n) {
|
|
|
|
i->bvec = bvec;
|
|
|
|
i->iov_offset = n - unroll;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
unroll -= n;
|
|
|
|
}
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
} else if (iov_iter_is_folioq(i)) {
|
|
|
|
i->iov_offset = 0;
|
|
|
|
iov_iter_folioq_revert(i, unroll);
|
2017-02-17 23:42:24 +00:00
|
|
|
} else { /* same logics for iovec and kvec */
|
2023-03-29 14:52:15 +00:00
|
|
|
const struct iovec *iov = iter_iov(i);
|
2017-02-17 23:42:24 +00:00
|
|
|
while (1) {
|
|
|
|
size_t n = (--iov)->iov_len;
|
|
|
|
i->nr_segs++;
|
|
|
|
if (unroll <= n) {
|
2023-03-29 14:52:15 +00:00
|
|
|
i->__iov = iov;
|
2017-02-17 23:42:24 +00:00
|
|
|
i->iov_offset = n - unroll;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
unroll -= n;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_revert);
|
|
|
|
|
2014-04-05 03:12:29 +00:00
|
|
|
/*
|
|
|
|
* Return the count of just the current iov_iter segment.
|
|
|
|
*/
|
|
|
|
size_t iov_iter_single_seg_count(const struct iov_iter *i)
|
|
|
|
{
|
2021-06-02 21:25:59 +00:00
|
|
|
if (i->nr_segs > 1) {
|
|
|
|
if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
|
2023-03-29 14:52:15 +00:00
|
|
|
return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
|
2021-06-02 21:25:59 +00:00
|
|
|
if (iov_iter_is_bvec(i))
|
|
|
|
return min(i->count, i->bvec->bv_len - i->iov_offset);
|
|
|
|
}
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
if (unlikely(iov_iter_is_folioq(i)))
|
|
|
|
return !i->count ? 0 :
|
|
|
|
umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count);
|
2021-06-02 21:25:59 +00:00
|
|
|
return i->count;
|
2014-04-05 03:12:29 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_single_seg_count);
|
|
|
|
|
2018-10-19 23:57:56 +00:00
|
|
|
void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
|
2015-01-23 06:08:07 +00:00
|
|
|
const struct kvec *kvec, unsigned long nr_segs,
|
2014-11-24 19:46:11 +00:00
|
|
|
size_t count)
|
|
|
|
{
|
2018-10-19 23:57:56 +00:00
|
|
|
WARN_ON(direction & ~(READ | WRITE));
|
2021-04-22 18:50:39 +00:00
|
|
|
*i = (struct iov_iter){
|
|
|
|
.iter_type = ITER_KVEC,
|
|
|
|
.data_source = direction,
|
|
|
|
.kvec = kvec,
|
|
|
|
.nr_segs = nr_segs,
|
|
|
|
.iov_offset = 0,
|
|
|
|
.count = count
|
|
|
|
};
|
2014-11-24 19:46:11 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_kvec);
|
|
|
|
|
2018-10-19 23:57:56 +00:00
|
|
|
void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
|
2015-01-23 06:08:07 +00:00
|
|
|
const struct bio_vec *bvec, unsigned long nr_segs,
|
|
|
|
size_t count)
|
|
|
|
{
|
2018-10-19 23:57:56 +00:00
|
|
|
WARN_ON(direction & ~(READ | WRITE));
|
2021-04-22 18:50:39 +00:00
|
|
|
*i = (struct iov_iter){
|
|
|
|
.iter_type = ITER_BVEC,
|
|
|
|
.data_source = direction,
|
|
|
|
.bvec = bvec,
|
|
|
|
.nr_segs = nr_segs,
|
|
|
|
.iov_offset = 0,
|
|
|
|
.count = count
|
|
|
|
};
|
2015-01-23 06:08:07 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_bvec);
|
|
|
|
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we go, without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
/**
|
|
|
|
* iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue
|
|
|
|
* @i: The iterator to initialise.
|
|
|
|
* @direction: The direction of the transfer.
|
|
|
|
* @folioq: The starting point in the folio queue.
|
|
|
|
* @first_slot: The first slot in the folio queue to use
|
|
|
|
* @offset: The offset into the folio in the first slot to start at
|
|
|
|
* @count: The size of the I/O buffer in bytes.
|
|
|
|
*
|
|
|
|
* Set up an I/O iterator to either draw data out of the pages attached to an
|
|
|
|
* inode or to inject data into those pages. The pages *must* be prevented
|
|
|
|
* from evaporation, either by taking a ref on them or locking them by the
|
|
|
|
* caller.
|
|
|
|
*/
|
|
|
|
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
|
|
|
|
const struct folio_queue *folioq, unsigned int first_slot,
|
|
|
|
unsigned int offset, size_t count)
|
|
|
|
{
|
|
|
|
BUG_ON(direction & ~1);
|
|
|
|
*i = (struct iov_iter) {
|
|
|
|
.iter_type = ITER_FOLIOQ,
|
|
|
|
.data_source = direction,
|
|
|
|
.folioq = folioq,
|
|
|
|
.folioq_slot = first_slot,
|
|
|
|
.count = count,
|
|
|
|
.iov_offset = offset,
|
|
|
|
};
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_folio_queue);
|
|
|
|
|
2020-02-10 10:00:21 +00:00
|
|
|
/**
|
|
|
|
* iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
|
|
|
|
* @i: The iterator to initialise.
|
|
|
|
* @direction: The direction of the transfer.
|
|
|
|
* @xarray: The xarray to access.
|
|
|
|
* @start: The start file position.
|
|
|
|
* @count: The size of the I/O buffer in bytes.
|
|
|
|
*
|
|
|
|
* Set up an I/O iterator to either draw data out of the pages attached to an
|
|
|
|
* inode or to inject data into those pages. The pages *must* be prevented
|
|
|
|
* from evaporation, either by taking a ref on them or locking them by the
|
|
|
|
* caller.
|
|
|
|
*/
|
|
|
|
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
|
|
|
|
struct xarray *xarray, loff_t start, size_t count)
|
|
|
|
{
|
|
|
|
BUG_ON(direction & ~1);
|
2021-04-22 18:50:39 +00:00
|
|
|
*i = (struct iov_iter) {
|
|
|
|
.iter_type = ITER_XARRAY,
|
|
|
|
.data_source = direction,
|
|
|
|
.xarray = xarray,
|
|
|
|
.xarray_start = start,
|
|
|
|
.count = count,
|
|
|
|
.iov_offset = 0
|
|
|
|
};
|
2020-02-10 10:00:21 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_xarray);
|
|
|
|
|
2018-10-19 23:57:56 +00:00
|
|
|
/**
|
|
|
|
* iov_iter_discard - Initialise an I/O iterator that discards data
|
|
|
|
* @i: The iterator to initialise.
|
|
|
|
* @direction: The direction of the transfer.
|
|
|
|
* @count: The size of the I/O buffer in bytes.
|
|
|
|
*
|
|
|
|
* Set up an I/O iterator that just discards everything that's written to it.
|
|
|
|
* It's only available as a READ iterator.
|
|
|
|
*/
|
|
|
|
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
|
|
|
|
{
|
|
|
|
BUG_ON(direction != READ);
|
2021-04-22 18:50:39 +00:00
|
|
|
*i = (struct iov_iter){
|
|
|
|
.iter_type = ITER_DISCARD,
|
|
|
|
.data_source = false,
|
|
|
|
.count = count,
|
|
|
|
.iov_offset = 0
|
|
|
|
};
|
2018-10-19 23:57:56 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_discard);
|
|
|
|
|
2022-06-10 19:58:27 +00:00
|
|
|
static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
|
|
|
|
unsigned len_mask)
|
|
|
|
{
|
2024-01-23 22:24:46 +00:00
|
|
|
const struct iovec *iov = iter_iov(i);
|
2022-06-10 19:58:27 +00:00
|
|
|
size_t size = i->count;
|
|
|
|
size_t skip = i->iov_offset;
|
|
|
|
|
2024-01-23 22:24:46 +00:00
|
|
|
do {
|
2023-03-29 14:52:15 +00:00
|
|
|
size_t len = iov->iov_len - skip;
|
2022-06-10 19:58:27 +00:00
|
|
|
|
|
|
|
if (len > size)
|
|
|
|
len = size;
|
|
|
|
if (len & len_mask)
|
|
|
|
return false;
|
2023-03-29 14:52:15 +00:00
|
|
|
if ((unsigned long)(iov->iov_base + skip) & addr_mask)
|
2022-06-10 19:58:27 +00:00
|
|
|
return false;
|
|
|
|
|
2024-01-23 22:24:46 +00:00
|
|
|
iov++;
|
2022-06-10 19:58:27 +00:00
|
|
|
size -= len;
|
2024-01-23 22:24:46 +00:00
|
|
|
skip = 0;
|
|
|
|
} while (size);
|
|
|
|
|
2022-06-10 19:58:27 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
|
|
|
|
unsigned len_mask)
|
|
|
|
{
|
2024-01-23 22:24:46 +00:00
|
|
|
const struct bio_vec *bvec = i->bvec;
|
2022-06-10 19:58:27 +00:00
|
|
|
unsigned skip = i->iov_offset;
|
2024-01-23 22:24:46 +00:00
|
|
|
size_t size = i->count;
|
2022-06-10 19:58:27 +00:00
|
|
|
|
2024-01-23 22:24:46 +00:00
|
|
|
do {
|
|
|
|
size_t len = bvec->bv_len;
|
2022-06-10 19:58:27 +00:00
|
|
|
|
|
|
|
if (len > size)
|
|
|
|
len = size;
|
|
|
|
if (len & len_mask)
|
|
|
|
return false;
|
2024-01-23 22:24:46 +00:00
|
|
|
if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
|
2022-06-10 19:58:27 +00:00
|
|
|
return false;
|
|
|
|
|
2024-01-23 22:24:46 +00:00
|
|
|
bvec++;
|
2022-06-10 19:58:27 +00:00
|
|
|
size -= len;
|
2024-01-23 22:24:46 +00:00
|
|
|
skip = 0;
|
|
|
|
} while (size);
|
|
|
|
|
2022-06-10 19:58:27 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* iov_iter_is_aligned() - Check if the addresses and lengths of each segments
|
|
|
|
* are aligned to the parameters.
|
|
|
|
*
|
|
|
|
* @i: &struct iov_iter to restore
|
|
|
|
* @addr_mask: bit mask to check against the iov element's addresses
|
|
|
|
* @len_mask: bit mask to check against the iov element's lengths
|
|
|
|
*
|
|
|
|
* Return: false if any addresses or lengths intersect with the provided masks
|
|
|
|
*/
|
|
|
|
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
|
|
|
|
unsigned len_mask)
|
|
|
|
{
|
2022-05-22 18:59:25 +00:00
|
|
|
if (likely(iter_is_ubuf(i))) {
|
|
|
|
if (i->count & len_mask)
|
|
|
|
return false;
|
|
|
|
if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2022-06-10 19:58:27 +00:00
|
|
|
if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
|
|
|
|
return iov_iter_aligned_iovec(i, addr_mask, len_mask);
|
|
|
|
|
|
|
|
if (iov_iter_is_bvec(i))
|
|
|
|
return iov_iter_aligned_bvec(i, addr_mask, len_mask);
|
|
|
|
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
/* With both xarray and folioq types, we're dealing with whole folios. */
|
2022-06-10 19:58:27 +00:00
|
|
|
if (iov_iter_is_xarray(i)) {
|
|
|
|
if (i->count & len_mask)
|
|
|
|
return false;
|
|
|
|
if ((i->xarray_start + i->iov_offset) & addr_mask)
|
|
|
|
return false;
|
|
|
|
}
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
if (iov_iter_is_folioq(i)) {
|
|
|
|
if (i->count & len_mask)
|
|
|
|
return false;
|
|
|
|
if (i->iov_offset & addr_mask)
|
|
|
|
return false;
|
|
|
|
}
|
2022-06-10 19:58:27 +00:00
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
|
|
|
|
|
2021-04-25 04:44:35 +00:00
|
|
|
static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
|
2014-04-05 03:12:29 +00:00
|
|
|
{
|
2024-01-23 22:24:46 +00:00
|
|
|
const struct iovec *iov = iter_iov(i);
|
iov_iter.c: macros for iterating over iov_iter
iterate_all_kinds(iter, size, ident, step_iovec, step_bvec)
iterates through the ranges covered by iter (up to size bytes total),
repeating step_iovec or step_bvec for each of those. ident is
declared in expansion of that thing, either as struct iovec or
struct bvec, and it contains the range we are currently looking
at. step_bvec should be a void expression, step_iovec - a size_t
one, with non-zero meaning "stop here, that many bytes from this
range left". In the end, the amount actually handled is stored
in size.
iov_iter_copy_from_user_atomic() and iov_iter_alignment() converted
to it.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-11-27 18:51:41 +00:00
|
|
|
unsigned long res = 0;
|
|
|
|
size_t size = i->count;
|
2021-04-25 04:44:35 +00:00
|
|
|
size_t skip = i->iov_offset;
|
|
|
|
|
2024-01-23 22:24:46 +00:00
|
|
|
do {
|
2023-03-29 14:52:15 +00:00
|
|
|
size_t len = iov->iov_len - skip;
|
2021-04-25 04:44:35 +00:00
|
|
|
if (len) {
|
2023-03-29 14:52:15 +00:00
|
|
|
res |= (unsigned long)iov->iov_base + skip;
|
2021-04-25 04:44:35 +00:00
|
|
|
if (len > size)
|
|
|
|
len = size;
|
|
|
|
res |= len;
|
|
|
|
size -= len;
|
|
|
|
}
|
2024-01-23 22:24:46 +00:00
|
|
|
iov++;
|
|
|
|
skip = 0;
|
|
|
|
} while (size);
|
2021-04-25 04:44:35 +00:00
|
|
|
return res;
|
|
|
|
}
|
iov_iter.c: macros for iterating over iov_iter
iterate_all_kinds(iter, size, ident, step_iovec, step_bvec)
iterates through the ranges covered by iter (up to size bytes total),
repeating step_iovec or step_bvec for each of those. ident is
declared in expansion of that thing, either as struct iovec or
struct bvec, and it contains the range we are currently looking
at. step_bvec should be a void expression, step_iovec - a size_t
one, with non-zero meaning "stop here, that many bytes from this
range left". In the end, the amount actually handled is stored
in size.
iov_iter_copy_from_user_atomic() and iov_iter_alignment() converted
to it.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-11-27 18:51:41 +00:00
|
|
|
|
2021-04-25 04:44:35 +00:00
|
|
|
static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
|
|
|
|
{
|
2024-01-23 22:24:46 +00:00
|
|
|
const struct bio_vec *bvec = i->bvec;
|
2021-04-25 04:44:35 +00:00
|
|
|
unsigned res = 0;
|
|
|
|
size_t size = i->count;
|
|
|
|
unsigned skip = i->iov_offset;
|
|
|
|
|
2024-01-23 22:24:46 +00:00
|
|
|
do {
|
|
|
|
size_t len = bvec->bv_len - skip;
|
|
|
|
res |= (unsigned long)bvec->bv_offset + skip;
|
2021-04-25 04:44:35 +00:00
|
|
|
if (len > size)
|
|
|
|
len = size;
|
|
|
|
res |= len;
|
2024-01-23 22:24:46 +00:00
|
|
|
bvec++;
|
2021-04-25 04:44:35 +00:00
|
|
|
size -= len;
|
2024-01-23 22:24:46 +00:00
|
|
|
skip = 0;
|
|
|
|
} while (size);
|
|
|
|
|
2021-04-25 04:44:35 +00:00
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long iov_iter_alignment(const struct iov_iter *i)
|
|
|
|
{
|
2022-05-22 18:59:25 +00:00
|
|
|
if (likely(iter_is_ubuf(i))) {
|
|
|
|
size_t size = i->count;
|
|
|
|
if (size)
|
|
|
|
return ((unsigned long)i->ubuf + i->iov_offset) | size;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-04-25 04:44:35 +00:00
|
|
|
/* iovec and kvec have identical layouts */
|
|
|
|
if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
|
|
|
|
return iov_iter_alignment_iovec(i);
|
|
|
|
|
|
|
|
if (iov_iter_is_bvec(i))
|
|
|
|
return iov_iter_alignment_bvec(i);
|
|
|
|
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
/* With both xarray and folioq types, we're dealing with whole folios. */
|
|
|
|
if (iov_iter_is_folioq(i))
|
|
|
|
return i->iov_offset | i->count;
|
2021-04-25 04:44:35 +00:00
|
|
|
if (iov_iter_is_xarray(i))
|
2021-04-25 21:02:38 +00:00
|
|
|
return (i->xarray_start + i->iov_offset) | i->count;
|
2021-04-25 04:44:35 +00:00
|
|
|
|
|
|
|
return 0;
|
2014-04-05 03:12:29 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_alignment);
|
|
|
|
|
2016-04-08 23:05:19 +00:00
|
|
|
unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
|
|
|
|
{
|
2016-12-22 02:55:02 +00:00
|
|
|
unsigned long res = 0;
|
2021-04-25 05:03:16 +00:00
|
|
|
unsigned long v = 0;
|
2016-04-08 23:05:19 +00:00
|
|
|
size_t size = i->count;
|
2021-04-25 05:03:16 +00:00
|
|
|
unsigned k;
|
2016-04-08 23:05:19 +00:00
|
|
|
|
2022-05-22 18:59:25 +00:00
|
|
|
if (iter_is_ubuf(i))
|
|
|
|
return 0;
|
|
|
|
|
2021-04-25 05:03:16 +00:00
|
|
|
if (WARN_ON(!iter_is_iovec(i)))
|
2016-09-22 20:33:12 +00:00
|
|
|
return ~0U;
|
|
|
|
|
2021-04-25 05:03:16 +00:00
|
|
|
for (k = 0; k < i->nr_segs; k++) {
|
2023-03-29 14:52:15 +00:00
|
|
|
const struct iovec *iov = iter_iov(i) + k;
|
|
|
|
if (iov->iov_len) {
|
|
|
|
unsigned long base = (unsigned long)iov->iov_base;
|
2021-04-25 05:03:16 +00:00
|
|
|
if (v) // if not the first one
|
|
|
|
res |= base | v; // this start | previous end
|
2023-03-29 14:52:15 +00:00
|
|
|
v = base + iov->iov_len;
|
|
|
|
if (size <= iov->iov_len)
|
2021-04-25 05:03:16 +00:00
|
|
|
break;
|
2023-03-29 14:52:15 +00:00
|
|
|
size -= iov->iov_len;
|
2021-04-25 05:03:16 +00:00
|
|
|
}
|
|
|
|
}
|
2016-12-22 02:55:02 +00:00
|
|
|
return res;
|
2016-04-08 23:05:19 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_gap_alignment);
|
|
|
|
|
2022-06-17 18:45:41 +00:00
|
|
|
/*
 * Work out how many page pointers are needed to cover @size bytes starting
 * @start bytes into the first page, capped at @maxpages.  If *@res is NULL,
 * allocate an array of that many pointers with kvmalloc_array() and store it
 * in *@res; a non-NULL *@res is left as it is.
 *
 * Returns the (capped) page count, or 0 if the allocation failed.
 */
static int want_pages_array(struct page ***res, size_t size,
			    size_t start, unsigned int maxpages)
{
	unsigned int wanted = DIV_ROUND_UP(size + start, PAGE_SIZE);

	if (wanted > maxpages)
		wanted = maxpages;
	WARN_ON(!wanted);	// caller should've prevented that

	/* caller already supplied an array: nothing to allocate */
	if (*res)
		return wanted;

	*res = kvmalloc_array(wanted, sizeof(struct page *), GFP_KERNEL);
	if (!*res)
		return 0;

	return wanted;
}
|
|
|
|
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we go, without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
/*
 * Collect pages from an ITER_FOLIOQ iterator into *@ppages, taking a
 * reference on each with get_page(), until @maxsize bytes or @maxpages pages
 * have been gathered.  The in-page offset of the first byte is stored in
 * *@_start_offset.  On success the iterator's count, iov_offset, folioq and
 * folioq_slot are updated to the position after the extracted data.
 *
 * Returns the number of bytes extracted, -EIO if the iterator sits past the
 * last slot of its folio_queue with a non-zero offset, or -ENOMEM if no page
 * array could be obtained.
 */
static ssize_t iter_folioq_get_pages(struct iov_iter *iter,
				     struct page ***ppages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	const struct folio_queue *fq = iter->folioq;
	unsigned int slot = iter->folioq_slot;
	size_t left = iter->count;
	size_t iov_offset = iter->iov_offset;
	size_t extracted = 0;
	size_t page_off;
	struct page **out;

	/* Positioned beyond the last slot: step into the next queue segment. */
	if (slot >= folioq_nr_slots(fq)) {
		fq = fq->next;
		slot = 0;
		if (WARN_ON(iov_offset != 0))
			return -EIO;
	}

	page_off = iov_offset & ~PAGE_MASK;
	maxpages = want_pages_array(ppages, maxsize, page_off, maxpages);
	if (!maxpages)
		return -ENOMEM;
	*_start_offset = page_off;
	out = *ppages;

	for (;;) {
		struct folio *folio = folioq_folio(fq, slot);
		size_t fsize = folioq_folio_size(fq, slot);
		size_t offset = iov_offset;
		/* at most up to the end of the current page ... */
		size_t part = PAGE_SIZE - offset % PAGE_SIZE;
		struct page *pg;

		/* ... and no more than the request or the folio allows */
		part = umin(part, umin(maxsize - extracted, fsize - offset));
		left -= part;
		iov_offset += part;
		extracted += part;

		pg = folio_page(folio, offset / PAGE_SIZE);
		*out = pg;
		get_page(*out);
		out++;

		if (--maxpages == 0 || extracted >= maxsize)
			break;

		/* Folio used up: move to the next slot, or next segment if any. */
		if (iov_offset >= fsize) {
			iov_offset = 0;
			slot++;
			if (slot == folioq_nr_slots(fq) && fq->next) {
				fq = fq->next;
				slot = 0;
			}
		}
	}

	iter->count = left;
	iter->iov_offset = iov_offset;
	iter->folioq = fq;
	iter->folioq_slot = slot;
	return extracted;
}
|
|
|
|
|
2020-02-10 10:00:21 +00:00
|
|
|
static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
|
|
|
|
pgoff_t index, unsigned int nr_pages)
|
|
|
|
{
|
|
|
|
XA_STATE(xas, xa, index);
|
|
|
|
struct page *page;
|
|
|
|
unsigned int ret = 0;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
for (page = xas_load(&xas); page; page = xas_next(&xas)) {
|
|
|
|
if (xas_retry(&xas, page))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Has the page moved or been split? */
|
|
|
|
if (unlikely(page != xas_reload(&xas))) {
|
|
|
|
xas_reset(&xas);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
pages[ret] = find_subpage(page, xas.xa_index);
|
|
|
|
get_page(pages[ret]);
|
|
|
|
if (++ret == nr_pages)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Collect pages backing an ITER_XARRAY iterator, starting at its current
 * position (xarray_start + iov_offset).  The in-page offset of that position
 * is stored in *@_start_offset.  On success i->iov_offset and i->count are
 * moved past the bytes covered by the pages obtained.
 *
 * Returns the number of bytes covered, 0 if no page was found, or -ENOMEM
 * if no page array could be obtained.
 */
static ssize_t iter_xarray_get_pages(struct iov_iter *i,
				     struct page ***pages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	loff_t pos = i->xarray_start + i->iov_offset;
	pgoff_t index = pos >> PAGE_SHIFT;
	unsigned offset = pos & ~PAGE_MASK;
	unsigned count, nr;

	*_start_offset = offset;

	count = want_pages_array(pages, maxsize, offset, maxpages);
	if (!count)
		return -ENOMEM;

	nr = iter_xarray_populate_pages(*pages, i->xarray, index, count);
	if (!nr)
		return 0;

	/* the pages found may cover less than was asked for */
	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
	i->count -= maxsize;
	i->iov_offset += maxsize;
	return maxsize;
}
|
|
|
|
|
2022-05-22 18:59:25 +00:00
|
|
|
/* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
|
2022-06-17 20:07:49 +00:00
|
|
|
static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
|
2021-04-25 13:14:44 +00:00
|
|
|
{
|
|
|
|
size_t skip;
|
|
|
|
long k;
|
|
|
|
|
2022-05-22 18:59:25 +00:00
|
|
|
if (iter_is_ubuf(i))
|
|
|
|
return (unsigned long)i->ubuf + i->iov_offset;
|
|
|
|
|
2021-04-25 13:14:44 +00:00
|
|
|
for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
|
2023-03-29 14:52:15 +00:00
|
|
|
const struct iovec *iov = iter_iov(i) + k;
|
|
|
|
size_t len = iov->iov_len - skip;
|
2021-04-25 13:14:44 +00:00
|
|
|
|
|
|
|
if (unlikely(!len))
|
|
|
|
continue;
|
2022-06-21 20:10:37 +00:00
|
|
|
if (*size > len)
|
|
|
|
*size = len;
|
2023-03-29 14:52:15 +00:00
|
|
|
return (unsigned long)iov->iov_base + skip;
|
2021-04-25 13:14:44 +00:00
|
|
|
}
|
|
|
|
BUG(); // if it had been empty, we wouldn't get called
|
|
|
|
}
|
|
|
|
|
|
|
|
/* must be done on non-empty ITER_BVEC one */
|
|
|
|
static struct page *first_bvec_segment(const struct iov_iter *i,
|
2022-06-21 20:10:37 +00:00
|
|
|
size_t *size, size_t *start)
|
2021-04-25 13:14:44 +00:00
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
size_t skip = i->iov_offset, len;
|
|
|
|
|
|
|
|
len = i->bvec->bv_len - skip;
|
2022-06-21 20:10:37 +00:00
|
|
|
if (*size > len)
|
|
|
|
*size = len;
|
2021-04-25 13:14:44 +00:00
|
|
|
skip += i->bvec->bv_offset;
|
|
|
|
page = i->bvec->bv_page + skip / PAGE_SIZE;
|
2022-06-21 19:55:19 +00:00
|
|
|
*start = skip % PAGE_SIZE;
|
2021-04-25 13:14:44 +00:00
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
2022-06-17 17:54:15 +00:00
|
|
|
/*
 * Common worker for iov_iter_get_pages2() and iov_iter_get_pages_alloc2().
 *
 * Obtains references to the pages backing up to @maxsize bytes (clamped to
 * i->count and MAX_RW_COUNT) at the iterator's current position and advances
 * the iterator past the bytes covered.  The page array is *@pages if that is
 * non-NULL, otherwise want_pages_array() allocates one.  *@start receives
 * the offset of the first byte within the first page.
 *
 * Returns the number of bytes covered, 0 if there is nothing to do, or a
 * negative errno (-ENOMEM, the get_user_pages_fast() result, or -EFAULT for
 * an iterator type that is not handled here).
 */
static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   unsigned int maxpages, size_t *start)
{
	unsigned int npages, gup_flags = 0;

	if (maxsize > i->count)
		maxsize = i->count;
	if (maxsize == 0)
		return 0;
	if (maxsize > MAX_RW_COUNT)
		maxsize = MAX_RW_COUNT;

	if (likely(user_backed_iter(i))) {
		unsigned long uaddr;
		int pinned;

		/* ask for writable pages unless the iterator direction is WRITE */
		if (iov_iter_rw(i) != WRITE)
			gup_flags |= FOLL_WRITE;
		if (i->nofault)
			gup_flags |= FOLL_NOFAULT;

		uaddr = first_iovec_segment(i, &maxsize);
		*start = uaddr % PAGE_SIZE;
		uaddr &= PAGE_MASK;

		npages = want_pages_array(pages, maxsize, *start, maxpages);
		if (!npages)
			return -ENOMEM;

		pinned = get_user_pages_fast(uaddr, npages, gup_flags, *pages);
		if (unlikely(pinned <= 0))
			return pinned;

		/* fewer pages than requested may have been obtained */
		maxsize = min_t(size_t, maxsize, pinned * PAGE_SIZE - *start);
		iov_iter_advance(i, maxsize);
		return maxsize;
	}

	if (iov_iter_is_bvec(i)) {
		struct page *first = first_bvec_segment(i, &maxsize, start);
		struct page **array;
		unsigned int k;

		npages = want_pages_array(pages, maxsize, *start, maxpages);
		if (!npages)
			return -ENOMEM;

		array = *pages;
		for (k = 0; k < npages; k++) {
			array[k] = first + k;
			get_page(array[k]);
		}

		maxsize = min_t(size_t, maxsize, npages * PAGE_SIZE - *start);
		i->count -= maxsize;
		i->iov_offset += maxsize;

		/* current bio_vec fully consumed: step to the next one */
		if (i->iov_offset == i->bvec->bv_len) {
			i->iov_offset = 0;
			i->bvec++;
			i->nr_segs--;
		}
		return maxsize;
	}

	if (iov_iter_is_folioq(i))
		return iter_folioq_get_pages(i, pages, maxsize, maxpages, start);

	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);

	return -EFAULT;
}
|
|
|
|
|
2023-06-14 14:03:41 +00:00
|
|
|
/*
 * Get references to the pages behind the iterator's current position into
 * the caller-supplied array @pages, which has room for @maxpages entries.
 * Returns 0 straight away when @maxpages is 0; a NULL @pages with a
 * non-zero @maxpages is a BUG.  Otherwise returns whatever
 * __iov_iter_get_pages_alloc() returns.
 */
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
		   size_t maxsize, unsigned maxpages, size_t *start)
{
	if (maxpages == 0)
		return 0;

	BUG_ON(pages == NULL);

	return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
}
|
2022-06-10 17:05:12 +00:00
|
|
|
EXPORT_SYMBOL(iov_iter_get_pages2);
|
2022-06-11 00:38:20 +00:00
|
|
|
|
2023-06-14 14:03:41 +00:00
|
|
|
/*
 * Like iov_iter_get_pages2(), but the page array is allocated on the
 * caller's behalf (no page-count cap beyond ~0U) and handed back in *@pages.
 * If the result is zero or negative, any array that was allocated is freed
 * with kvfree() and *@pages is left NULL.
 */
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
		   struct page ***pages, size_t maxsize, size_t *start)
{
	ssize_t got;

	/* NULL tells the worker that it has to allocate the array */
	*pages = NULL;

	got = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
	if (got > 0)
		return got;

	kvfree(*pages);
	*pages = NULL;
	return got;
}
|
2022-06-10 17:05:12 +00:00
|
|
|
EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
|
2014-04-05 03:12:29 +00:00
|
|
|
|
2021-04-25 20:00:48 +00:00
|
|
|
static int iov_npages(const struct iov_iter *i, int maxpages)
|
2014-04-05 03:12:29 +00:00
|
|
|
{
|
2021-04-25 20:00:48 +00:00
|
|
|
size_t skip = i->iov_offset, size = i->count;
|
|
|
|
const struct iovec *p;
|
2014-11-27 19:09:46 +00:00
|
|
|
int npages = 0;
|
|
|
|
|
2023-03-29 14:52:15 +00:00
|
|
|
for (p = iter_iov(i); size; skip = 0, p++) {
|
2021-04-25 20:00:48 +00:00
|
|
|
unsigned offs = offset_in_page(p->iov_base + skip);
|
|
|
|
size_t len = min(p->iov_len - skip, size);
|
2014-11-27 19:09:46 +00:00
|
|
|
|
2021-04-25 20:00:48 +00:00
|
|
|
if (len) {
|
|
|
|
size -= len;
|
|
|
|
npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
|
|
|
|
if (unlikely(npages > maxpages))
|
|
|
|
return maxpages;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return npages;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int bvec_npages(const struct iov_iter *i, int maxpages)
|
|
|
|
{
|
|
|
|
size_t skip = i->iov_offset, size = i->count;
|
|
|
|
const struct bio_vec *p;
|
|
|
|
int npages = 0;
|
|
|
|
|
|
|
|
for (p = i->bvec; size; skip = 0, p++) {
|
|
|
|
unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
|
|
|
|
size_t len = min(p->bv_len - skip, size);
|
|
|
|
|
|
|
|
size -= len;
|
|
|
|
npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
|
|
|
|
if (unlikely(npages > maxpages))
|
|
|
|
return maxpages;
|
|
|
|
}
|
|
|
|
return npages;
|
|
|
|
}
|
|
|
|
|
|
|
|
int iov_iter_npages(const struct iov_iter *i, int maxpages)
|
|
|
|
{
|
|
|
|
if (unlikely(!i->count))
|
|
|
|
return 0;
|
2022-05-22 18:59:25 +00:00
|
|
|
if (likely(iter_is_ubuf(i))) {
|
|
|
|
unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
|
|
|
|
int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
|
|
|
|
return min(npages, maxpages);
|
|
|
|
}
|
2021-04-25 20:00:48 +00:00
|
|
|
/* iovec and kvec have identical layouts */
|
|
|
|
if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
|
|
|
|
return iov_npages(i, maxpages);
|
|
|
|
if (iov_iter_is_bvec(i))
|
|
|
|
return bvec_npages(i, maxpages);
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
if (iov_iter_is_folioq(i)) {
|
|
|
|
unsigned offset = i->iov_offset % PAGE_SIZE;
|
|
|
|
int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
|
|
|
|
return min(npages, maxpages);
|
|
|
|
}
|
2021-04-25 20:00:48 +00:00
|
|
|
if (iov_iter_is_xarray(i)) {
|
2021-05-03 15:05:29 +00:00
|
|
|
unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
|
|
|
|
int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
|
2021-04-25 20:00:48 +00:00
|
|
|
return min(npages, maxpages);
|
|
|
|
}
|
|
|
|
return 0;
|
2014-04-05 03:12:29 +00:00
|
|
|
}
|
2014-03-19 05:16:16 +00:00
|
|
|
EXPORT_SYMBOL(iov_iter_npages);
|
2015-02-01 01:08:47 +00:00
|
|
|
|
|
|
|
/*
 * Copy *@old into *@new and, for iterators backed by a segment array, give
 * @new its own kmemdup()'d copy of that array.
 *
 * Returns the duplicated array (which is also stored in @new) for ITER_BVEC,
 * ITER_KVEC and ITER_IOVEC iterators; this is whatever kmemdup() returned, so
 * it may be NULL.  For every other iterator type nothing is duplicated and
 * NULL is returned.
 */
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;

	if (iov_iter_is_bvec(new)) {
		new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
		return new->bvec;
	}

	if (iov_iter_is_kvec(new) || iter_is_iovec(new)) {
		/* iovec and kvec have identical layout */
		new->__iov = kmemdup(new->__iov,
				     new->nr_segs * sizeof(struct iovec),
				     flags);
		return new->__iov;
	}

	return NULL;
}
EXPORT_SYMBOL(dup_iter);
|
saner iov_iter initialization primitives
iovec-backed iov_iter instances are assumed to satisfy several properties:
* no more than UIO_MAXIOV elements in iovec array
* total size of all ranges is no more than MAX_RW_COUNT
* all ranges pass access_ok().
The problem is, invariants of data structures should be established in the
primitives creating those data structures, not in the code using those
primitives. And iov_iter_init() violates that principle. For a while we
managed to get away with that, but once the use of iov_iter started to
spread, it didn't take long for shit to hit the fan - missed check in
sys_sendto() had introduced a roothole.
We _do_ have primitives for importing and validating iovecs (both native and
compat ones) and those primitives are almost always followed by shoving the
resulting iovec into iov_iter. Life would be considerably simpler (and safer)
if we combined those primitives with initializing iov_iter.
That gives us two new primitives - import_iovec() and compat_import_iovec().
Calling conventions:
iovec = iov_array;
err = import_iovec(direction, uvec, nr_segs,
ARRAY_SIZE(iov_array), &iovec,
&iter);
imports user vector into kernel space (into iov_array if it fits, allocated
if it doesn't fit or if iovec was NULL), validates it and sets iter up to
refer to it. On success 0 is returned and allocated kernel copy (or NULL
if the array had fit into caller-supplied one) is returned via iovec.
On failure all allocations are undone and -E... is returned. If the total
size of ranges exceeds MAX_RW_COUNT, the excess is silently truncated.
compat_import_iovec() expects uvec to be a pointer to user array of compat_iovec;
otherwise it's identical to import_iovec().
Finally, import_single_range() sets iov_iter backed by single-element iovec
covering a user-supplied range -
err = import_single_range(direction, address, size, iovec, &iter);
does validation and sets iter up. Again, size in excess of MAX_RW_COUNT gets
silently truncated.
Next commits will be switching the things up to use of those and reducing
the amount of iov_iter_init() instances.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-03-21 21:45:43 +00:00
|
|
|
|
2023-04-12 16:46:48 +00:00
|
|
|
static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
|
2024-01-29 18:37:29 +00:00
|
|
|
const struct iovec __user *uvec, u32 nr_segs)
|
2020-09-25 04:51:40 +00:00
|
|
|
{
|
|
|
|
const struct compat_iovec __user *uiov =
|
|
|
|
(const struct compat_iovec __user *)uvec;
|
2024-01-29 18:37:29 +00:00
|
|
|
int ret = -EFAULT;
|
|
|
|
u32 i;
|
2020-09-25 04:51:40 +00:00
|
|
|
|
2021-01-11 17:19:26 +00:00
|
|
|
if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
|
2020-09-25 04:51:40 +00:00
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_segs; i++) {
|
|
|
|
compat_uptr_t buf;
|
|
|
|
compat_ssize_t len;
|
|
|
|
|
|
|
|
unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
|
|
|
|
unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
|
|
|
|
|
|
|
|
/* check for compat_size_t not fitting in compat_ssize_t .. */
|
|
|
|
if (len < 0) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto uaccess_end;
|
|
|
|
}
|
|
|
|
iov[i].iov_base = compat_ptr(buf);
|
|
|
|
iov[i].iov_len = len;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
uaccess_end:
|
|
|
|
user_access_end();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2023-06-16 12:43:55 +00:00
|
|
|
static __noclone int copy_iovec_from_user(struct iovec *iov,
|
2023-03-30 21:53:51 +00:00
|
|
|
const struct iovec __user *uiov, unsigned long nr_segs)
|
2020-09-25 04:51:39 +00:00
|
|
|
{
|
2023-03-30 21:53:51 +00:00
|
|
|
int ret = -EFAULT;
|
2020-09-25 04:51:39 +00:00
|
|
|
|
2023-03-30 21:53:51 +00:00
|
|
|
if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
|
2020-09-25 04:51:40 +00:00
|
|
|
return -EFAULT;
|
2020-09-25 04:51:39 +00:00
|
|
|
|
2023-03-30 21:53:51 +00:00
|
|
|
do {
|
|
|
|
void __user *buf;
|
|
|
|
ssize_t len;
|
|
|
|
|
|
|
|
unsafe_get_user(len, &uiov->iov_len, uaccess_end);
|
|
|
|
unsafe_get_user(buf, &uiov->iov_base, uaccess_end);
|
|
|
|
|
|
|
|
/* check for size_t not fitting in ssize_t .. */
|
|
|
|
if (unlikely(len < 0)) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto uaccess_end;
|
|
|
|
}
|
|
|
|
iov->iov_base = buf;
|
|
|
|
iov->iov_len = len;
|
|
|
|
|
|
|
|
uiov++; iov++;
|
|
|
|
} while (--nr_segs);
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
uaccess_end:
|
|
|
|
user_access_end();
|
|
|
|
return ret;
|
2020-09-25 04:51:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Copy an iovec array of @nr_segs entries in from userspace.
 *
 * @fast_iov is used as the destination when it has room (@nr_segs does not
 * exceed @fast_segs); otherwise an array is obtained from kmalloc_array().
 * @compat selects the compat-layout copy routine.
 *
 * Returns the array that was filled in (@fast_iov itself when @nr_segs is 0),
 * or an ERR_PTR(): -EINVAL if @nr_segs exceeds UIO_MAXIOV, -ENOMEM if the
 * allocation failed, or the error from the copy routine.  An array allocated
 * here is freed again before an error is returned.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int err;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (!nr_segs)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);

	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (unlikely(compat))
		err = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		err = copy_iovec_from_user(iov, uvec, nr_segs);
	if (!err)
		return iov;

	/* Only free what was allocated above, never the caller's array. */
	if (iov != fast_iov)
		kfree(iov);
	return ERR_PTR(err);
}
|
|
|
|
|
2023-03-24 20:37:19 +00:00
|
|
|
/*
|
|
|
|
* Single segment iovec supplied by the user, import it as ITER_UBUF.
|
|
|
|
*/
|
|
|
|
static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
|
|
|
|
struct iovec **iovp, struct iov_iter *i,
|
|
|
|
bool compat)
|
|
|
|
{
|
|
|
|
struct iovec *iov = *iovp;
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
if (compat)
|
|
|
|
ret = copy_compat_iovec_from_user(iov, uvec, 1);
|
|
|
|
else
|
|
|
|
ret = copy_iovec_from_user(iov, uvec, 1);
|
|
|
|
if (unlikely(ret))
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = import_ubuf(type, iov->iov_base, iov->iov_len, i);
|
|
|
|
if (unlikely(ret))
|
|
|
|
return ret;
|
|
|
|
*iovp = NULL;
|
|
|
|
return i->count;
|
|
|
|
}
|
|
|
|
|
2020-09-25 04:51:40 +00:00
|
|
|
/*
 * Import a user iovec array and initialise @i to iterate over it.
 *
 * A single-segment request is handed to __import_iovec_ubuf().  Otherwise the
 * array is copied in with iovec_from_user(), every segment is checked with
 * access_ok(), and the running total is capped at MAX_RW_COUNT by shortening
 * the segment that would exceed it (later segments end up with length 0).
 *
 * On return *@iovp is NULL if the caller-supplied array was used or if an
 * error occurred, and points to the newly allocated array otherwise.
 *
 * Returns the total number of bytes covered by the iterator, or a negative
 * error code.
 */
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	struct iovec *iov;
	ssize_t total = 0;
	unsigned long n;

	if (nr_segs == 1)
		return __import_iovec_ubuf(type, uvec, iovp, i, compat);

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (n = 0; n < nr_segs; n++) {
		struct iovec *seg = &iov[n];
		ssize_t len = (ssize_t)seg->iov_len;

		if (!access_ok(seg->iov_base, len))
			goto efault;

		if (len > MAX_RW_COUNT - total) {
			len = MAX_RW_COUNT - total;
			seg->iov_len = len;
		}
		total += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total);
	/* Only hand back an array the caller has to free. */
	*iovp = (iov == *iovp) ? NULL : iov;
	return total;

efault:
	if (iov != *iovp)
		kfree(iov);
	*iovp = NULL;
	return -EFAULT;
}
|
|
|
|
|
2016-10-08 09:18:07 +00:00
|
|
|
/**
|
|
|
|
* import_iovec() - Copy an array of &struct iovec from userspace
|
|
|
|
* into the kernel, check that it is valid, and initialize a new
|
|
|
|
* &struct iov_iter iterator to access it.
|
|
|
|
*
|
|
|
|
* @type: One of %READ or %WRITE.
|
2020-09-25 04:51:40 +00:00
|
|
|
* @uvec: Pointer to the userspace array.
|
2016-10-08 09:18:07 +00:00
|
|
|
* @nr_segs: Number of elements in userspace array.
|
|
|
|
* @fast_segs: Number of elements in @iov.
|
2020-09-25 04:51:40 +00:00
|
|
|
* @iovp: (input and output parameter) Pointer to pointer to (usually small
|
2016-10-08 09:18:07 +00:00
|
|
|
* on-stack) kernel array.
|
|
|
|
* @i: Pointer to iterator that will be initialized on success.
|
|
|
|
*
|
|
|
|
* If the array pointed to by *@iov is large enough to hold all @nr_segs,
|
|
|
|
* then this function places %NULL in *@iov on return. Otherwise, a new
|
|
|
|
* array will be allocated and the result placed in *@iov. This means that
|
|
|
|
* the caller may call kfree() on *@iov regardless of whether the small
|
|
|
|
* on-stack array was used or not (and regardless of whether this function
|
|
|
|
* returns an error or not).
|
|
|
|
*
|
2019-05-14 22:02:22 +00:00
|
|
|
* Return: Negative error code on error, bytes imported on success
|
2016-10-08 09:18:07 +00:00
|
|
|
*/
|
2020-09-25 04:51:40 +00:00
|
|
|
ssize_t import_iovec(int type, const struct iovec __user *uvec,
|
saner iov_iter initialization primitives
iovec-backed iov_iter instances are assumed to satisfy several properties:
* no more than UIO_MAXIOV elements in iovec array
* total size of all ranges is no more than MAX_RW_COUNT
* all ranges pass access_ok().
The problem is, invariants of data structures should be established in the
primitives creating those data structures, not in the code using those
primitives. And iov_iter_init() violates that principle. For a while we
managed to get away with that, but once the use of iov_iter started to
spread, it didn't take long for shit to hit the fan - missed check in
sys_sendto() had introduced a roothole.
We _do_ have primitives for importing and validating iovecs (both native and
compat ones) and those primitives are almost always followed by shoving the
resulting iovec into iov_iter. Life would be considerably simpler (and safer)
if we combined those primitives with initializing iov_iter.
That gives us two new primitives - import_iovec() and compat_import_iovec().
Calling conventions:
iovec = iov_array;
err = import_iovec(direction, uvec, nr_segs,
ARRAY_SIZE(iov_array), &iovec,
&iter);
imports user vector into kernel space (into iov_array if it fits, allocated
if it doesn't fit or if iovec was NULL), validates it and sets iter up to
refer to it. On success 0 is returned and allocated kernel copy (or NULL
if the array had fit into caller-supplied one) is returned via iovec.
On failure all allocations are undone and -E... is returned. If the total
size of ranges exceeds MAX_RW_COUNT, the excess is silently truncated.
compat_import_iovec() expects uvec to be a pointer to user array of compat_iovec;
otherwise it's identical to import_iovec().
Finally, import_single_range() sets iov_iter backed by single-element iovec
covering a user-supplied range -
err = import_single_range(direction, address, size, iovec, &iter);
does validation and sets iter up. Again, size in excess of MAX_RW_COUNT gets
silently truncated.
Next commits will be switching the things up to use of those and reducing
the amount of iov_iter_init() instances.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-03-21 21:45:43 +00:00
|
|
|
unsigned nr_segs, unsigned fast_segs,
|
2020-09-25 04:51:40 +00:00
|
|
|
struct iovec **iovp, struct iov_iter *i)
|
saner iov_iter initialization primitives
iovec-backed iov_iter instances are assumed to satisfy several properties:
* no more than UIO_MAXIOV elements in iovec array
* total size of all ranges is no more than MAX_RW_COUNT
* all ranges pass access_ok().
The problem is, invariants of data structures should be established in the
primitives creating those data structures, not in the code using those
primitives. And iov_iter_init() violates that principle. For a while we
managed to get away with that, but once the use of iov_iter started to
spread, it didn't take long for shit to hit the fan - missed check in
sys_sendto() had introduced a roothole.
We _do_ have primitives for importing and validating iovecs (both native and
compat ones) and those primitives are almost always followed by shoving the
resulting iovec into iov_iter. Life would be considerably simpler (and safer)
if we combined those primitives with initializing iov_iter.
That gives us two new primitives - import_iovec() and compat_import_iovec().
Calling conventions:
iovec = iov_array;
err = import_iovec(direction, uvec, nr_segs,
ARRAY_SIZE(iov_array), &iovec,
&iter);
imports user vector into kernel space (into iov_array if it fits, allocated
if it doesn't fit or if iovec was NULL), validates it and sets iter up to
refer to it. On success 0 is returned and allocated kernel copy (or NULL
if the array had fit into caller-supplied one) is returned via iovec.
On failure all allocations are undone and -E... is returned. If the total
size of ranges exceeds MAX_RW_COUNT, the excess is silently truncated.
compat_import_iovec() expects uvec to be a pointer to user array of compat_iovec;
otherwise it's identical to import_iovec().
Finally, import_single_range() sets iov_iter backed by single-element iovec
covering a user-supplied range -
err = import_single_range(direction, address, size, iovec, &iter);
does validation and sets iter up. Again, size in excess of MAX_RW_COUNT gets
silently truncated.
Next commits will be switching the things up to use of those and reducing
the amount of iov_iter_init() instances.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-03-21 21:45:43 +00:00
|
|
|
{
|
2020-09-25 04:51:41 +00:00
|
|
|
return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
|
|
|
|
in_compat_syscall());
|
saner iov_iter initialization primitives
iovec-backed iov_iter instances are assumed to satisfy several properties:
* no more than UIO_MAXIOV elements in iovec array
* total size of all ranges is no more than MAX_RW_COUNT
* all ranges pass access_ok().
The problem is, invariants of data structures should be established in the
primitives creating those data structures, not in the code using those
primitives. And iov_iter_init() violates that principle. For a while we
managed to get away with that, but once the use of iov_iter started to
spread, it didn't take long for shit to hit the fan - missed check in
sys_sendto() had introduced a roothole.
We _do_ have primitives for importing and validating iovecs (both native and
compat ones) and those primitives are almost always followed by shoving the
resulting iovec into iov_iter. Life would be considerably simpler (and safer)
if we combined those primitives with initializing iov_iter.
That gives us two new primitives - import_iovec() and compat_import_iovec().
Calling conventions:
iovec = iov_array;
err = import_iovec(direction, uvec, nr_segs,
ARRAY_SIZE(iov_array), &iovec,
&iter);
imports user vector into kernel space (into iov_array if it fits, allocated
if it doesn't fit or if iovec was NULL), validates it and sets iter up to
refer to it. On success 0 is returned and allocated kernel copy (or NULL
if the array had fit into caller-supplied one) is returned via iovec.
On failure all allocations are undone and -E... is returned. If the total
size of ranges exceeds MAX_RW_COUNT, the excess is silently truncated.
compat_import_iovec() expects uvec to be a pointer to user array of compat_iovec;
otherwise it's identical to import_iovec().
Finally, import_single_range() sets iov_iter backed by single-element iovec
covering a user-supplied range -
err = import_single_range(direction, address, size, iovec, &iter);
does validation and sets iter up. Again, size in excess of MAX_RW_COUNT gets
silently truncated.
Next commits will be switching the things up to use of those and reducing
the amount of iov_iter_init() instances.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-03-21 21:45:43 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(import_iovec);
|
|
|
|
|
2023-01-05 19:07:30 +00:00
|
|
|
/*
 * Set up @i as an ITER_UBUF iterator over the single user buffer @buf.
 *
 * @len is capped at MAX_RW_COUNT before the range is checked with
 * access_ok().
 *
 * Returns 0 on success or -EFAULT if the (capped) range fails access_ok(),
 * in which case @i is not touched.
 */
int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
	len = min_t(size_t, len, MAX_RW_COUNT);

	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov_iter_ubuf(i, rw, buf, len);
	return 0;
}
EXPORT_SYMBOL_GPL(import_ubuf);
|
2023-01-05 19:07:30 +00:00
|
|
|
|
2021-09-10 17:18:36 +00:00
|
|
|
/**
|
|
|
|
* iov_iter_restore() - Restore a &struct iov_iter to the same state as when
|
|
|
|
* iov_iter_save_state() was called.
|
|
|
|
*
|
|
|
|
* @i: &struct iov_iter to restore
|
|
|
|
* @state: state to restore from
|
|
|
|
*
|
|
|
|
* Used after iov_iter_save_state() to bring restore @i, if operations may
|
|
|
|
* have advanced it.
|
|
|
|
*
|
|
|
|
* Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
|
|
|
|
*/
|
|
|
|
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
|
|
|
|
{
|
2023-01-05 19:07:33 +00:00
|
|
|
if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
|
|
|
|
!iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
|
2021-09-10 17:18:36 +00:00
|
|
|
return;
|
|
|
|
i->iov_offset = state->iov_offset;
|
|
|
|
i->count = state->count;
|
2022-05-22 18:59:25 +00:00
|
|
|
if (iter_is_ubuf(i))
|
|
|
|
return;
|
2021-09-10 17:18:36 +00:00
|
|
|
/*
|
|
|
|
* For the *vec iters, nr_segs + iov is constant - if we increment
|
|
|
|
* the vec, then we also decrement the nr_segs count. Hence we don't
|
|
|
|
* need to track both of these, just one is enough and we can deduct
|
|
|
|
* the other from that. ITER_KVEC and ITER_IOVEC are the same struct
|
|
|
|
* size, so we can just increment the iov pointer as they are unionzed.
|
|
|
|
* ITER_BVEC _may_ be the same size on some archs, but on others it is
|
|
|
|
* not. Be safe and handle it separately.
|
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
|
|
|
|
if (iov_iter_is_bvec(i))
|
|
|
|
i->bvec -= state->nr_segs - i->nr_segs;
|
|
|
|
else
|
2023-03-29 14:52:15 +00:00
|
|
|
i->__iov -= state->nr_segs - i->nr_segs;
|
2021-09-10 17:18:36 +00:00
|
|
|
i->nr_segs = state->nr_segs;
|
|
|
|
}
|
2022-10-28 20:50:30 +00:00
|
|
|
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we go, without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
/*
 * Extract a list of contiguous pages from an ITER_FOLIOQ iterator.  This does
 * not get references on the pages, nor does it get a pin on them.
 *
 * Page pointers are stored through *@pages, one entry per page-sized (or
 * smaller) piece, until either @maxpages entries have been stored or @maxsize
 * bytes have been covered.  The iterator's count, offset, slot and queue
 * segment are advanced past what was extracted.  The offset of the data
 * within the first page is returned through @offset0.  @extraction_flags is
 * not used here.
 *
 * Returns the number of bytes extracted, -ENOMEM if want_pages_array()
 * yields no page slots, or -EIO if the iterator is positioned past the last
 * slot of its segment with a non-zero offset.
 */
static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i,
					     struct page ***pages, size_t maxsize,
					     unsigned int maxpages,
					     iov_iter_extraction_t extraction_flags,
					     size_t *offset0)
{
	const struct folio_queue *folioq = i->folioq;
	struct page **p;
	unsigned int nr = 0;
	size_t extracted = 0, offset, slot = i->folioq_slot;

	/*
	 * The iterator may be parked just past the last slot of a segment;
	 * if so, start from the first slot of the next segment.
	 * NOTE(review): folioq->next is dereferenced below without a NULL
	 * check - presumably callers guarantee a next segment exists here.
	 */
	if (slot >= folioq_nr_slots(folioq)) {
		folioq = folioq->next;
		slot = 0;
		if (WARN_ON(i->iov_offset != 0))
			return -EIO;
	}

	/* Offset of the starting position within its page. */
	offset = i->iov_offset & ~PAGE_MASK;
	*offset0 = offset;

	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
	if (!maxpages)
		return -ENOMEM;
	p = *pages;

	for (;;) {
		struct folio *folio = folioq_folio(folioq, slot);
		/* This 'offset' shadows the outer one: it is the offset into the folio. */
		size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot);
		/* Bytes from the current position to the end of the current page. */
		size_t part = PAGE_SIZE - offset % PAGE_SIZE;

		if (offset < fsize) {
			/* Also limited by what is left of @maxsize and of this folio. */
			part = umin(part, umin(maxsize - extracted, fsize - offset));
			i->count -= part;
			i->iov_offset += part;
			extracted += part;

			p[nr++] = folio_page(folio, offset / PAGE_SIZE);
		}

		/* Stop before moving on, so the iterator stays in this slot. */
		if (nr >= maxpages || extracted >= maxsize)
			break;

		/* Folio consumed: go to the next slot, or the next segment if any. */
		if (i->iov_offset >= fsize) {
			i->iov_offset = 0;
			slot++;
			if (slot == folioq_nr_slots(folioq) && folioq->next) {
				folioq = folioq->next;
				slot = 0;
			}
		}
	}

	/* Record where the walk stopped. */
	i->folioq = folioq;
	i->folioq_slot = slot;
	return extracted;
}
|
|
|
|
|
2022-10-28 20:50:30 +00:00
|
|
|
/*
 * Extract a list of contiguous pages from an ITER_XARRAY iterator.  This does not
 * get references on the pages, nor does it get a pin on them.
 *
 * The xarray is walked from the page index that contains the iterator's
 * current position (xarray_start + iov_offset), storing up to @maxpages page
 * pointers through *@pages.  The offset of the position within the first page
 * is returned through @offset0.  @extraction_flags is not used here.
 *
 * Returns the number of bytes the iterator was advanced by, or -ENOMEM if
 * want_pages_array() yields no page slots.
 */
static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
					     struct page ***pages, size_t maxsize,
					     unsigned int maxpages,
					     iov_iter_extraction_t extraction_flags,
					     size_t *offset0)
{
	struct page *page, **p;
	unsigned int nr = 0, offset;
	loff_t pos = i->xarray_start + i->iov_offset;
	pgoff_t index = pos >> PAGE_SHIFT;
	XA_STATE(xas, i->xarray, index);

	/* Offset of the starting position within its page. */
	offset = pos & ~PAGE_MASK;
	*offset0 = offset;

	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
	if (!maxpages)
		return -ENOMEM;
	p = *pages;

	/* The walk runs under the RCU read lock; it ends at the first NULL entry. */
	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		p[nr++] = find_subpage(page, xas.xa_index);
		if (nr == maxpages)
			break;
	}
	rcu_read_unlock();

	/*
	 * Clamp to the bytes actually covered by the pages that were found.
	 * NOTE(review): if no page was found (nr == 0) and offset is non-zero,
	 * the unsigned subtraction wraps and @maxsize is used unchanged -
	 * confirm callers cannot hit this.
	 */
	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
	iov_iter_advance(i, maxsize);
	return maxsize;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Extract a list of contiguous pages from an ITER_BVEC iterator. This does
|
|
|
|
* not get references on the pages, nor does it get a pin on them.
|
|
|
|
*/
|
|
|
|
static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
|
|
|
|
struct page ***pages, size_t maxsize,
|
|
|
|
unsigned int maxpages,
|
|
|
|
iov_iter_extraction_t extraction_flags,
|
|
|
|
size_t *offset0)
|
|
|
|
{
|
|
|
|
struct page **p, *page;
|
2023-09-08 16:03:20 +00:00
|
|
|
size_t skip = i->iov_offset, offset, size;
|
2022-10-28 20:50:30 +00:00
|
|
|
int k;
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
if (i->nr_segs == 0)
|
|
|
|
return 0;
|
2023-09-08 16:03:20 +00:00
|
|
|
size = min(maxsize, i->bvec->bv_len - skip);
|
|
|
|
if (size)
|
2022-10-28 20:50:30 +00:00
|
|
|
break;
|
|
|
|
i->iov_offset = 0;
|
|
|
|
i->nr_segs--;
|
|
|
|
i->bvec++;
|
|
|
|
skip = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
skip += i->bvec->bv_offset;
|
|
|
|
page = i->bvec->bv_page + skip / PAGE_SIZE;
|
|
|
|
offset = skip % PAGE_SIZE;
|
|
|
|
*offset0 = offset;
|
|
|
|
|
2023-09-08 16:03:20 +00:00
|
|
|
maxpages = want_pages_array(pages, size, offset, maxpages);
|
2022-10-28 20:50:30 +00:00
|
|
|
if (!maxpages)
|
|
|
|
return -ENOMEM;
|
|
|
|
p = *pages;
|
|
|
|
for (k = 0; k < maxpages; k++)
|
|
|
|
p[k] = page + k;
|
|
|
|
|
2023-09-08 16:03:20 +00:00
|
|
|
size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
|
|
|
|
iov_iter_advance(i, size);
|
|
|
|
return size;
|
2022-10-28 20:50:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
|
|
|
|
* This does not get references on the pages, nor does it get a pin on them.
|
|
|
|
*/
|
|
|
|
static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
|
|
|
|
struct page ***pages, size_t maxsize,
|
|
|
|
unsigned int maxpages,
|
|
|
|
iov_iter_extraction_t extraction_flags,
|
|
|
|
size_t *offset0)
|
|
|
|
{
|
|
|
|
struct page **p, *page;
|
|
|
|
const void *kaddr;
|
2023-09-08 16:03:20 +00:00
|
|
|
size_t skip = i->iov_offset, offset, len, size;
|
2022-10-28 20:50:30 +00:00
|
|
|
int k;
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
if (i->nr_segs == 0)
|
|
|
|
return 0;
|
2023-09-08 16:03:20 +00:00
|
|
|
size = min(maxsize, i->kvec->iov_len - skip);
|
|
|
|
if (size)
|
2022-10-28 20:50:30 +00:00
|
|
|
break;
|
|
|
|
i->iov_offset = 0;
|
|
|
|
i->nr_segs--;
|
|
|
|
i->kvec++;
|
|
|
|
skip = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
kaddr = i->kvec->iov_base + skip;
|
|
|
|
offset = (unsigned long)kaddr & ~PAGE_MASK;
|
|
|
|
*offset0 = offset;
|
|
|
|
|
2023-09-08 16:03:20 +00:00
|
|
|
maxpages = want_pages_array(pages, size, offset, maxpages);
|
2022-10-28 20:50:30 +00:00
|
|
|
if (!maxpages)
|
|
|
|
return -ENOMEM;
|
|
|
|
p = *pages;
|
|
|
|
|
|
|
|
kaddr -= offset;
|
2023-09-08 16:03:20 +00:00
|
|
|
len = offset + size;
|
2022-10-28 20:50:30 +00:00
|
|
|
for (k = 0; k < maxpages; k++) {
|
|
|
|
size_t seg = min_t(size_t, len, PAGE_SIZE);
|
|
|
|
|
|
|
|
if (is_vmalloc_or_module_addr(kaddr))
|
|
|
|
page = vmalloc_to_page(kaddr);
|
|
|
|
else
|
|
|
|
page = virt_to_page(kaddr);
|
|
|
|
|
|
|
|
p[k] = page;
|
|
|
|
len -= seg;
|
|
|
|
kaddr += PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
2023-09-08 16:03:20 +00:00
|
|
|
size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
|
|
|
|
iov_iter_advance(i, size);
|
|
|
|
return size;
|
2022-10-28 20:50:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Extract a list of contiguous pages from a user iterator and get a pin on
|
|
|
|
* each of them. This should only be used if the iterator is user-backed
|
|
|
|
* (IOBUF/UBUF).
|
|
|
|
*
|
|
|
|
* It does not get refs on the pages, but the pages must be unpinned by the
|
|
|
|
* caller once the transfer is complete.
|
|
|
|
*
|
|
|
|
* This is safe to be used where background IO/DMA *is* going to be modifying
|
|
|
|
* the buffer; using a pin rather than a ref makes forces fork() to give the
|
|
|
|
* child a copy of the page.
|
|
|
|
*/
|
|
|
|
static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
|
|
|
|
struct page ***pages,
|
|
|
|
size_t maxsize,
|
|
|
|
unsigned int maxpages,
|
|
|
|
iov_iter_extraction_t extraction_flags,
|
|
|
|
size_t *offset0)
|
|
|
|
{
|
|
|
|
unsigned long addr;
|
|
|
|
unsigned int gup_flags = 0;
|
|
|
|
size_t offset;
|
|
|
|
int res;
|
|
|
|
|
|
|
|
if (i->data_source == ITER_DEST)
|
|
|
|
gup_flags |= FOLL_WRITE;
|
|
|
|
if (extraction_flags & ITER_ALLOW_P2PDMA)
|
|
|
|
gup_flags |= FOLL_PCI_P2PDMA;
|
|
|
|
if (i->nofault)
|
|
|
|
gup_flags |= FOLL_NOFAULT;
|
|
|
|
|
|
|
|
addr = first_iovec_segment(i, &maxsize);
|
|
|
|
*offset0 = offset = addr % PAGE_SIZE;
|
|
|
|
addr &= PAGE_MASK;
|
|
|
|
maxpages = want_pages_array(pages, maxsize, offset, maxpages);
|
|
|
|
if (!maxpages)
|
|
|
|
return -ENOMEM;
|
|
|
|
res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages);
|
|
|
|
if (unlikely(res <= 0))
|
|
|
|
return res;
|
|
|
|
maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
|
|
|
|
iov_iter_advance(i, maxsize);
|
|
|
|
return maxsize;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
|
|
|
|
* @i: The iterator to extract from
|
|
|
|
* @pages: Where to return the list of pages
|
|
|
|
* @maxsize: The maximum amount of iterator to extract
|
|
|
|
* @maxpages: The maximum size of the list of pages
|
|
|
|
* @extraction_flags: Flags to qualify request
|
|
|
|
* @offset0: Where to return the starting offset into (*@pages)[0]
|
|
|
|
*
|
|
|
|
* Extract a list of contiguous pages from the current point of the iterator,
|
|
|
|
* advancing the iterator. The maximum number of pages and the maximum amount
|
|
|
|
* of page contents can be set.
|
|
|
|
*
|
|
|
|
* If *@pages is NULL, a page list will be allocated to the required size and
|
|
|
|
* *@pages will be set to its base. If *@pages is not NULL, it will be assumed
|
|
|
|
* that the caller allocated a page list at least @maxpages in size and this
|
|
|
|
* will be filled in.
|
|
|
|
*
|
|
|
|
* @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
|
|
|
|
* be allowed on the pages extracted.
|
|
|
|
*
|
|
|
|
* The iov_iter_extract_will_pin() function can be used to query how cleanup
|
|
|
|
* should be performed.
|
|
|
|
*
|
|
|
|
* Extra refs or pins on the pages may be obtained as follows:
|
|
|
|
*
|
|
|
|
* (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
|
|
|
|
* added to the pages, but refs will not be taken.
|
|
|
|
* iov_iter_extract_will_pin() will return true.
|
|
|
|
*
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
* (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the
|
|
|
|
* pages are merely listed; no extra refs or pins are obtained.
|
2022-10-28 20:50:30 +00:00
|
|
|
* iov_iter_extract_will_pin() will return 0.
|
|
|
|
*
|
|
|
|
* Note also:
|
|
|
|
*
|
|
|
|
* (*) Use with ITER_DISCARD is not supported as that has no content.
|
|
|
|
*
|
|
|
|
* On success, the function sets *@pages to the new pagelist, if allocated, and
|
|
|
|
* sets *offset0 to the offset into the first page.
|
|
|
|
*
|
|
|
|
* It may also return -ENOMEM and -EFAULT.
|
|
|
|
*/
|
|
|
|
ssize_t iov_iter_extract_pages(struct iov_iter *i,
|
|
|
|
struct page ***pages,
|
|
|
|
size_t maxsize,
|
|
|
|
unsigned int maxpages,
|
|
|
|
iov_iter_extraction_t extraction_flags,
|
|
|
|
size_t *offset0)
|
|
|
|
{
|
|
|
|
maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT);
|
|
|
|
if (!maxsize)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (likely(user_backed_iter(i)))
|
|
|
|
return iov_iter_extract_user_pages(i, pages, maxsize,
|
|
|
|
maxpages, extraction_flags,
|
|
|
|
offset0);
|
|
|
|
if (iov_iter_is_kvec(i))
|
|
|
|
return iov_iter_extract_kvec_pages(i, pages, maxsize,
|
|
|
|
maxpages, extraction_flags,
|
|
|
|
offset0);
|
|
|
|
if (iov_iter_is_bvec(i))
|
|
|
|
return iov_iter_extract_bvec_pages(i, pages, maxsize,
|
|
|
|
maxpages, extraction_flags,
|
|
|
|
offset0);
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-18 23:20:42 +00:00
|
|
|
if (iov_iter_is_folioq(i))
|
|
|
|
return iov_iter_extract_folioq_pages(i, pages, maxsize,
|
|
|
|
maxpages, extraction_flags,
|
|
|
|
offset0);
|
2022-10-28 20:50:30 +00:00
|
|
|
if (iov_iter_is_xarray(i))
|
|
|
|
return iov_iter_extract_xarray_pages(i, pages, maxsize,
|
|
|
|
maxpages, extraction_flags,
|
|
|
|
offset0);
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
|