mirror of
https://github.com/torvalds/linux.git
synced 2024-11-25 05:32:00 +00:00
7f053812da
Depending on the architecture, building a 32-bit vDSO on a 64-bit kernel is problematic when some system headers are included. Minimise the amount of headers by moving needed items, such as __{get,put}_unaligned_t, into dedicated common headers and in general use more specific headers, similar to what was done in commit8165b57bca
("linux/const.h: Extract common header for vDSO") and commit8c59ab839f
("lib/vdso: Enable common headers"). On some architectures this results in missing PAGE_SIZE, as was described by commit8b3843ae36
("vdso/datapage: Quick fix - use asm/page-def.h for ARM64"), so define this if necessary, in the same way as done prior by commitcffaefd15a
("vdso: Use CONFIG_PAGE_SHIFT in vdso/datapage.h"). Removing linux/time64.h leads to missing 'struct timespec64' in x86's asm/pvclock.h. Add a forward declaration of that struct in that file. Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
261 lines
9.6 KiB
C
261 lines
9.6 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
|
*/
|
|
|
|
#include <linux/array_size.h>
|
|
#include <linux/minmax.h>
|
|
#include <vdso/datapage.h>
|
|
#include <vdso/getrandom.h>
|
|
#include <vdso/unaligned.h>
|
|
#include <asm/vdso/getrandom.h>
|
|
#include <uapi/linux/mman.h>
|
|
#include <uapi/linux/random.h>
|
|
|
|
#undef PAGE_SIZE
|
|
#undef PAGE_MASK
|
|
#define PAGE_SIZE (1UL << CONFIG_PAGE_SHIFT)
|
|
#define PAGE_MASK (~(PAGE_SIZE - 1))
|
|
|
|
#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do { \
|
|
while (len >= sizeof(type)) { \
|
|
__put_unaligned_t(type, __get_unaligned_t(type, src), dst); \
|
|
__put_unaligned_t(type, 0, src); \
|
|
dst += sizeof(type); \
|
|
src += sizeof(type); \
|
|
len -= sizeof(type); \
|
|
} \
|
|
} while (0)
|
|
|
|
static void memcpy_and_zero_src(void *dst, void *src, size_t len)
|
|
{
|
|
if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
|
|
if (IS_ENABLED(CONFIG_64BIT))
|
|
MEMCPY_AND_ZERO_SRC(u64, dst, src, len);
|
|
MEMCPY_AND_ZERO_SRC(u32, dst, src, len);
|
|
MEMCPY_AND_ZERO_SRC(u16, dst, src, len);
|
|
}
|
|
MEMCPY_AND_ZERO_SRC(u8, dst, src, len);
|
|
}
|
|
|
|
/**
|
|
* __cvdso_getrandom_data - Generic vDSO implementation of getrandom() syscall.
|
|
* @rng_info: Describes state of kernel RNG, memory shared with kernel.
|
|
* @buffer: Destination buffer to fill with random bytes.
|
|
* @len: Size of @buffer in bytes.
|
|
* @flags: Zero or more GRND_* flags.
|
|
* @opaque_state: Pointer to an opaque state area.
|
|
* @opaque_len: Length of opaque state area.
|
|
*
|
|
* This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's
|
|
* getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same
|
|
* schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always
|
|
* calls into the syscall.
|
|
*
|
|
* If @buffer, @len, and @flags are 0, and @opaque_len is ~0UL, then @opaque_state is populated
|
|
* with a struct vgetrandom_opaque_params and the function returns 0; if it does not return 0,
|
|
* this function should not be used.
|
|
*
|
|
* @opaque_state *must* be allocated by calling mmap(2) using the mmap_prot and mmap_flags fields
|
|
* from the struct vgetrandom_opaque_params, and states must not straddle pages. Unless external
|
|
* locking is used, one state must be allocated per thread, as it is not safe to call this function
|
|
* concurrently with the same @opaque_state. However, it is safe to call this using the same
|
|
* @opaque_state that is shared between main code and signal handling code, within the same thread.
|
|
*
|
|
* Returns: The number of random bytes written to @buffer, or a negative value indicating an error.
|
|
*/
|
|
static __always_inline ssize_t
|
|
__cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len,
|
|
unsigned int flags, void *opaque_state, size_t opaque_len)
|
|
{
|
|
ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);
|
|
struct vgetrandom_state *state = opaque_state;
|
|
size_t batch_len, nblocks, orig_len = len;
|
|
bool in_use, have_retried = false;
|
|
void *orig_buffer = buffer;
|
|
u64 current_generation;
|
|
u32 counter[2] = { 0 };
|
|
|
|
if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) {
|
|
struct vgetrandom_opaque_params *params = opaque_state;
|
|
params->size_of_opaque_state = sizeof(*state);
|
|
params->mmap_prot = PROT_READ | PROT_WRITE;
|
|
params->mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS;
|
|
for (size_t i = 0; i < ARRAY_SIZE(params->reserved); ++i)
|
|
params->reserved[i] = 0;
|
|
return 0;
|
|
}
|
|
|
|
/* The state must not straddle a page, since pages can be zeroed at any time. */
|
|
if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE))
|
|
return -EFAULT;
|
|
|
|
/* Handle unexpected flags by falling back to the kernel. */
|
|
if (unlikely(flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE)))
|
|
goto fallback_syscall;
|
|
|
|
/* If the caller passes the wrong size, which might happen due to CRIU, fallback. */
|
|
if (unlikely(opaque_len != sizeof(*state)))
|
|
goto fallback_syscall;
|
|
|
|
/*
|
|
* If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from
|
|
* userspace, because A) the various @flags require this to block, or not, depending on
|
|
* various factors unavailable to userspace, and B) the kernel's behavior before the RNG is
|
|
* ready is to reseed from the entropy pool at every invocation.
|
|
*/
|
|
if (unlikely(!READ_ONCE(rng_info->is_ready)))
|
|
goto fallback_syscall;
|
|
|
|
/*
|
|
* This condition is checked after @rng_info->is_ready, because before the kernel's RNG is
|
|
* initialized, the @flags parameter may require this to block or return an error, even when
|
|
* len is zero.
|
|
*/
|
|
if (unlikely(!len))
|
|
return 0;
|
|
|
|
/*
|
|
* @state->in_use is basic reentrancy protection against this running in a signal handler
|
|
* with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one
|
|
* level of reentrancy. If a signal interrupts this after reading @state->in_use, but before
|
|
* writing @state->in_use, there is still no race, because the signal handler will run to
|
|
* its completion before returning execution.
|
|
*/
|
|
in_use = READ_ONCE(state->in_use);
|
|
if (unlikely(in_use))
|
|
/* The syscall simply fills the buffer and does not touch @state, so fallback. */
|
|
goto fallback_syscall;
|
|
WRITE_ONCE(state->in_use, true);
|
|
|
|
retry_generation:
|
|
/*
|
|
* @rng_info->generation must always be read here, as it serializes @state->key with the
|
|
* kernel's RNG reseeding schedule.
|
|
*/
|
|
current_generation = READ_ONCE(rng_info->generation);
|
|
|
|
/*
|
|
* If @state->generation doesn't match the kernel RNG's generation, then it means the
|
|
* kernel's RNG has reseeded, and so @state->key is reseeded as well.
|
|
*/
|
|
if (unlikely(state->generation != current_generation)) {
|
|
/*
|
|
* Write the generation before filling the key, in case of fork. If there is a fork
|
|
* just after this line, the parent and child will get different random bytes from
|
|
* the syscall, which is good. However, were this line to occur after the getrandom
|
|
* syscall, then both child and parent could have the same bytes and the same
|
|
* generation counter, so the fork would not be detected. Therefore, write
|
|
* @state->generation before the call to the getrandom syscall.
|
|
*/
|
|
WRITE_ONCE(state->generation, current_generation);
|
|
|
|
/*
|
|
* Prevent the syscall from being reordered wrt current_generation. Pairs with the
|
|
* smp_store_release(&_vdso_rng_data.generation) in random.c.
|
|
*/
|
|
smp_rmb();
|
|
|
|
/* Reseed @state->key using fresh bytes from the kernel. */
|
|
if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) {
|
|
/*
|
|
* If the syscall failed to refresh the key, then @state->key is now
|
|
* invalid, so invalidate the generation so that it is not used again, and
|
|
* fallback to using the syscall entirely.
|
|
*/
|
|
WRITE_ONCE(state->generation, 0);
|
|
|
|
/*
|
|
* Set @state->in_use to false only after the last write to @state in the
|
|
* line above.
|
|
*/
|
|
WRITE_ONCE(state->in_use, false);
|
|
|
|
goto fallback_syscall;
|
|
}
|
|
|
|
/*
|
|
* Set @state->pos to beyond the end of the batch, so that the batch is refilled
|
|
* using the new key.
|
|
*/
|
|
state->pos = sizeof(state->batch);
|
|
}
|
|
|
|
/* Set len to the total amount of bytes that this function is allowed to read, ret. */
|
|
len = ret;
|
|
more_batch:
|
|
/*
|
|
* First use bytes out of @state->batch, which may have been filled by the last call to this
|
|
* function.
|
|
*/
|
|
batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len);
|
|
if (batch_len) {
|
|
/* Zeroing at the same time as memcpying helps preserve forward secrecy. */
|
|
memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len);
|
|
state->pos += batch_len;
|
|
buffer += batch_len;
|
|
len -= batch_len;
|
|
}
|
|
|
|
if (!len) {
|
|
/* Prevent the loop from being reordered wrt ->generation. */
|
|
barrier();
|
|
|
|
/*
|
|
* Since @rng_info->generation will never be 0, re-read @state->generation, rather
|
|
* than using the local current_generation variable, to learn whether a fork
|
|
* occurred or if @state was zeroed due to memory pressure. Primarily, though, this
|
|
* indicates whether the kernel's RNG has reseeded, in which case generate a new key
|
|
* and start over.
|
|
*/
|
|
if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) {
|
|
/*
|
|
* Prevent this from looping forever in case of low memory or racing with a
|
|
* user force-reseeding the kernel's RNG using the ioctl.
|
|
*/
|
|
if (have_retried) {
|
|
WRITE_ONCE(state->in_use, false);
|
|
goto fallback_syscall;
|
|
}
|
|
|
|
have_retried = true;
|
|
buffer = orig_buffer;
|
|
goto retry_generation;
|
|
}
|
|
|
|
/*
|
|
* Set @state->in_use to false only when there will be no more reads or writes of
|
|
* @state.
|
|
*/
|
|
WRITE_ONCE(state->in_use, false);
|
|
return ret;
|
|
}
|
|
|
|
/* Generate blocks of RNG output directly into @buffer while there's enough room left. */
|
|
nblocks = len / CHACHA_BLOCK_SIZE;
|
|
if (nblocks) {
|
|
__arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks);
|
|
buffer += nblocks * CHACHA_BLOCK_SIZE;
|
|
len -= nblocks * CHACHA_BLOCK_SIZE;
|
|
}
|
|
|
|
BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0);
|
|
|
|
/* Refill the batch and overwrite the key, in order to preserve forward secrecy. */
|
|
__arch_chacha20_blocks_nostack(state->batch_key, state->key, counter,
|
|
sizeof(state->batch_key) / CHACHA_BLOCK_SIZE);
|
|
|
|
/* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */
|
|
state->pos = 0;
|
|
goto more_batch;
|
|
|
|
fallback_syscall:
|
|
return getrandom_syscall(orig_buffer, orig_len, flags);
|
|
}
|
|
|
|
static __always_inline ssize_t
|
|
__cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
|
|
{
|
|
return __cvdso_getrandom_data(__arch_get_vdso_rng_data(), buffer, len, flags, opaque_state, opaque_len);
|
|
}
|