457f218095
The current uaccess code uses a page table walk in some circumstances, e.g. in case of the in atomic futex operations or if running on old hardware which doesn't support the mvcos instruction. However it turned out that the page table walk code does not correctly lock page tables when accessing page table entries. In other words: a different cpu may invalidate a page table entry while the current cpu inspects the pte. This may lead to random data corruption. Adding correct locking however isn't trivial for all uaccess operations. Especially copy_in_user() is problematic since that requires to hold at least two locks, but must be protected against ABBA deadlock when a different cpu also performs a copy_in_user() operation. So the solution is a different approach where we change address spaces: User space runs in primary address mode, or access register mode within vdso code, like it currently already does. The kernel usually also runs in home space mode, however when accessing user space the kernel switches to primary or secondary address mode if the mvcos instruction is not available or if a compare-and-swap (futex) instruction on a user space address is performed. KVM however is special, since that requires the kernel to run in home address space while implicitly accessing user space with the sie instruction. So we end up with: User space: - runs in primary or access register mode - cr1 contains the user asce - cr7 contains the user asce - cr13 contains the kernel asce Kernel space: - runs in home space mode - cr1 contains the user or kernel asce -> the kernel asce is loaded when a uaccess requires primary or secondary address mode - cr7 contains the user or kernel asce, (changed with set_fs()) - cr13 contains the kernel asce In case of uaccess the kernel changes to: - primary space mode in case of a uaccess (copy_to_user) and uses e.g. the mvcp instruction to access user space. 
However the kernel will stay in home space mode if the mvcos instruction is available - secondary space mode in case of futex atomic operations, so that the instructions come from primary address space and data from secondary space. In case of kvm the kernel runs in home space mode, but cr1 gets switched to contain the gmap asce before the sie instruction gets executed. When the sie instruction is finished cr1 will be switched back to contain the user asce. A context switch between two processes will always load the kernel asce for the next process in cr1. So the first exit to user space is a bit more expensive (one extra load control register instruction) than before, however it keeps the code rather simple. In sum this means there is no need to perform any error-prone page table walks anymore when accessing user space. The patch seems to be rather large, however it mainly removes the page table walk code and restores the previously deleted "standard" uaccess code, with a couple of changes. The uaccess without mvcos mode can be enforced with the "uaccess_primary" kernel parameter. Reported-by: Christian Borntraeger <borntraeger@de.ibm.com> Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
347 lines
9.0 KiB
C
347 lines
9.0 KiB
C
/*
 *  S390 version
 *    Copyright IBM Corp. 1999, 2000
 *    Author(s): Hartmut Penner (hp@de.ibm.com),
 *               Martin Schwidefsky (schwidefsky@de.ibm.com)
 *
 *  Derived from "include/asm-i386/uaccess.h"
 */
|
|
#ifndef __S390_UACCESS_H
|
|
#define __S390_UACCESS_H
|
|
|
|
/*
 * User space memory access functions
 */
|
|
#include <linux/sched.h>
|
|
#include <linux/errno.h>
|
|
#include <asm/ctl_reg.h>
|
|
|
|
/*
 * Access type constants.
 * NOTE(review): presumably meant for the 'type' argument of access_ok(),
 * which this header accepts but does not evaluate - confirm against callers.
 */
#define VERIFY_READ	0
#define VERIFY_WRITE	1
|
|
|
|
|
|
/*
 * The fs value determines whether argument validity checking should be
 * performed or not.  If get_fs() == USER_DS, checking is performed, with
 * get_fs() == KERNEL_DS, checking is bypassed.
 *
 * For historical reasons, these macros are grossly misnamed.
 *
 * set_fs() additionally uses the segment value to pick the address space
 * control element (user or kernel) that it loads into control register 7.
 */

/* Build an mm_segment_t whose first member is initialised to @a. */
#define MAKE_MM_SEG(a)	((mm_segment_t) { (a) })

#define KERNEL_DS	MAKE_MM_SEG(0)
#define USER_DS		MAKE_MM_SEG(1)

/* get_ds() always yields KERNEL_DS; get_fs() reads the current task's value. */
#define get_ds()	(KERNEL_DS)
#define get_fs()	(current->thread.mm_segment)
|
|
|
|
/*
 * set_fs - set the mm_segment of the current task.
 * @x: new segment value (an mm_segment_t, e.g. KERNEL_DS or USER_DS).
 *
 * Stores @x in current->thread.mm_segment and then loads control register 7:
 * with the user ASCE from the lowcore when the segment's ar4 field is
 * non-zero, with the kernel ASCE otherwise.
 */
#define set_fs(x)							\
({									\
	unsigned long __pto;						\
	current->thread.mm_segment = (x);				\
	__pto = current->thread.mm_segment.ar4 ?			\
		S390_lowcore.user_asce : S390_lowcore.kernel_asce;	\
	__ctl_load(__pto, 7, 7);					\
})
|
|
|
|
/* Evaluates to non-zero when both segments carry the same ar4 value. */
#define segment_eq(a,b) ((a).ar4 == (b).ar4)
|
|
|
|
/*
 * __range_ok - range check for a user space access.
 * @addr: start address of the range.
 * @size: length of the range in bytes.
 *
 * Always returns 1: no address range is rejected by this check.
 */
static inline int __range_ok(unsigned long addr, unsigned long size)
{
	return 1;
}
|
|
|
|
/*
 * __access_ok - check a user space pointer and range.
 * @addr: user space pointer.
 * @size: length of the range in bytes.
 *
 * Passes @addr through __chk_user_ptr() and evaluates to the result of
 * __range_ok() for the given range.
 */
#define __access_ok(addr, size)				\
({							\
	__chk_user_ptr(addr);				\
	__range_ok((unsigned long)(addr), (size));	\
})
|
|
|
|
/* The 'type' argument is dropped; only addr and size reach __access_ok(). */
#define access_ok(type, addr, size) __access_ok(addr, size)
|
|
|
|
/*
 * The exception table consists of pairs of addresses: the first is the
 * address of an instruction that is allowed to fault, and the second is
 * the address at which the program should continue.  No registers are
 * modified, so it is entirely up to the continuation code to figure out
 * what to do.
 *
 * All the routines below use bits of fixup code that are out of line
 * with the main instruction path.  This means when everything is well,
 * we don't even have to jump over them.  Further, they do not intrude
 * on our cache or tlb entries.
 */

/*
 * Both members are signed offsets relative to the address of the member
 * itself; use extable_insn() / extable_fixup() to obtain the addresses.
 */
struct exception_table_entry
{
	int insn, fixup;
};

/* Address of the instruction that is allowed to fault. */
static inline unsigned long extable_insn(const struct exception_table_entry *x)
{
	return (unsigned long)&x->insn + x->insn;
}

/* Address at which execution continues after a fault at extable_insn(). */
static inline unsigned long extable_fixup(const struct exception_table_entry *x)
{
	return (unsigned long)&x->fixup + x->fixup;
}
|
|
|
|
/*
 * NOTE(review): presumably these tell generic code that the architecture
 * supplies its own exception table sort and search routines - confirm.
 */
#define ARCH_HAS_SORT_EXTABLE
#define ARCH_HAS_SEARCH_EXTABLE
|
|
|
|
/**
|
|
* __copy_from_user: - Copy a block of data from user space, with less checking.
|
|
* @to: Destination address, in kernel space.
|
|
* @from: Source address, in user space.
|
|
* @n: Number of bytes to copy.
|
|
*
|
|
* Context: User context only. This function may sleep.
|
|
*
|
|
* Copy data from user space to kernel space. Caller must check
|
|
* the specified block with access_ok() before calling this function.
|
|
*
|
|
* Returns number of bytes that could not be copied.
|
|
* On success, this will be zero.
|
|
*
|
|
* If some data could not be copied, this function will pad the copied
|
|
* data to the requested size using zero bytes.
|
|
*/
|
|
unsigned long __must_check __copy_from_user(void *to, const void __user *from,
|
|
unsigned long n);
|
|
|
|
/**
|
|
* __copy_to_user: - Copy a block of data into user space, with less checking.
|
|
* @to: Destination address, in user space.
|
|
* @from: Source address, in kernel space.
|
|
* @n: Number of bytes to copy.
|
|
*
|
|
* Context: User context only. This function may sleep.
|
|
*
|
|
* Copy data from kernel space to user space. Caller must check
|
|
* the specified block with access_ok() before calling this function.
|
|
*
|
|
* Returns number of bytes that could not be copied.
|
|
* On success, this will be zero.
|
|
*/
|
|
unsigned long __must_check __copy_to_user(void __user *to, const void *from,
|
|
unsigned long n);
|
|
|
|
/* The in-atomic variants are plain aliases of the regular copy routines. */
#define __copy_to_user_inatomic __copy_to_user
#define __copy_from_user_inatomic __copy_from_user
|
|
|
|
/*
 * __put_user_fn - store a kernel object to user space.
 * @x:    kernel address of the value to store.
 * @ptr:  destination address, in user space.
 * @size: number of bytes to store.
 *
 * Returns 0 if __copy_to_user() copied everything, -EFAULT if any byte
 * was left uncopied.
 */
static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
{
	size = __copy_to_user(ptr, x, size);
	return size ? -EFAULT : 0;
}
|
|
|
|
/*
 * __get_user_fn - fetch an object from user space.
 * @x:    kernel address that receives the value.
 * @ptr:  source address, in user space.
 * @size: number of bytes to fetch.
 *
 * Returns 0 if __copy_from_user() copied everything, -EFAULT if any byte
 * was left uncopied.
 */
static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size)
{
	size = __copy_from_user(x, ptr, size);
	return size ? -EFAULT : 0;
}
|
|
|
|
/*
 * These are the main single-value transfer routines.  They automatically
 * use the right size if we just have the right pointer type.
 */

/*
 * __put_user - store the value @x at the user space address @ptr.
 *
 * @x is first converted to the pointee type of @ptr.  Objects of 1, 2, 4
 * or 8 bytes are written through __put_user_fn(); every other size ends
 * up in __put_user_bad().  Evaluates to 0 on success or -EFAULT.
 */
#define __put_user(x, ptr)					\
({								\
	__typeof__(*(ptr)) __x = (x);				\
	int __pu_err = -EFAULT;					\
	__chk_user_ptr(ptr);					\
	switch (sizeof (*(ptr))) {				\
	case 1:							\
	case 2:							\
	case 4:							\
	case 8:							\
		__pu_err = __put_user_fn(&__x, ptr,		\
					 sizeof(*(ptr)));	\
		break;						\
	default:						\
		__put_user_bad();				\
		break;						\
	}							\
	__pu_err;						\
})
|
|
|
|
/*
 * put_user - store the value @x at the user space address @ptr.
 * Calls might_fault() and then evaluates to the result of __put_user().
 */
#define put_user(x, ptr)		\
({					\
	might_fault();			\
	__put_user(x, ptr);		\
})
|
|
|
|
|
|
/*
 * Referenced from the default case of __put_user() (unsupported object size).
 * NOTE(review): presumably left without a definition so that such a use
 * fails at link time - confirm.
 */
int __put_user_bad(void) __attribute__((noreturn));
|
|
|
|
/*
 * __get_user - fetch a value from the user space address @ptr into @x.
 *
 * For objects of 1, 2, 4 or 8 bytes the data is read with __get_user_fn()
 * into an unsigned temporary of the same width, which is then reinterpreted
 * as the pointee type of @ptr and assigned to @x.  The assignment happens
 * whether or not the read succeeded.  Every other size ends up in
 * __get_user_bad().  Evaluates to 0 on success or -EFAULT.
 */
#define __get_user(x, ptr)					\
({								\
	int __gu_err = -EFAULT;					\
	__chk_user_ptr(ptr);					\
	switch (sizeof(*(ptr))) {				\
	case 1: {						\
		unsigned char __x;				\
		__gu_err = __get_user_fn(&__x, ptr,		\
					 sizeof(*(ptr)));	\
		(x) = *(__force __typeof__(*(ptr)) *) &__x;	\
		break;						\
	}							\
	case 2: {						\
		unsigned short __x;				\
		__gu_err = __get_user_fn(&__x, ptr,		\
					 sizeof(*(ptr)));	\
		(x) = *(__force __typeof__(*(ptr)) *) &__x;	\
		break;						\
	}							\
	case 4: {						\
		unsigned int __x;				\
		__gu_err = __get_user_fn(&__x, ptr,		\
					 sizeof(*(ptr)));	\
		(x) = *(__force __typeof__(*(ptr)) *) &__x;	\
		break;						\
	}							\
	case 8: {						\
		unsigned long long __x;				\
		__gu_err = __get_user_fn(&__x, ptr,		\
					 sizeof(*(ptr)));	\
		(x) = *(__force __typeof__(*(ptr)) *) &__x;	\
		break;						\
	}							\
	default:						\
		__get_user_bad();				\
		break;						\
	}							\
	__gu_err;						\
})
|
|
|
|
/*
 * get_user - fetch a value from the user space address @ptr into @x.
 * Calls might_fault() and then evaluates to the result of __get_user().
 */
#define get_user(x, ptr)		\
({					\
	might_fault();			\
	__get_user(x, ptr);		\
})
|
|
|
|
/*
 * Referenced from the default case of __get_user() (unsupported object size).
 * NOTE(review): presumably left without a definition so that such a use
 * fails at link time - confirm.
 */
int __get_user_bad(void) __attribute__((noreturn));
|
|
|
|
/* The unaligned variants are plain aliases of __put_user() / __get_user(). */
#define __put_user_unaligned __put_user
#define __get_user_unaligned __get_user
|
|
|
|
/**
|
|
* copy_to_user: - Copy a block of data into user space.
|
|
* @to: Destination address, in user space.
|
|
* @from: Source address, in kernel space.
|
|
* @n: Number of bytes to copy.
|
|
*
|
|
* Context: User context only. This function may sleep.
|
|
*
|
|
* Copy data from kernel space to user space.
|
|
*
|
|
* Returns number of bytes that could not be copied.
|
|
* On success, this will be zero.
|
|
*/
|
|
static inline unsigned long __must_check
|
|
copy_to_user(void __user *to, const void *from, unsigned long n)
|
|
{
|
|
might_fault();
|
|
return __copy_to_user(to, from, n);
|
|
}
|
|
|
|
/*
 * Called by copy_from_user() when the destination object is known to be
 * smaller than the requested copy length.  With
 * CONFIG_DEBUG_STRICT_USER_COPY_CHECKS the declaration carries a
 * compile-time warning attribute.
 */
void copy_from_user_overflow(void)
#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
__compiletime_warning("copy_from_user() buffer size is not provably correct")
#endif
;
|
|
|
|
/**
|
|
* copy_from_user: - Copy a block of data from user space.
|
|
* @to: Destination address, in kernel space.
|
|
* @from: Source address, in user space.
|
|
* @n: Number of bytes to copy.
|
|
*
|
|
* Context: User context only. This function may sleep.
|
|
*
|
|
* Copy data from user space to kernel space.
|
|
*
|
|
* Returns number of bytes that could not be copied.
|
|
* On success, this will be zero.
|
|
*
|
|
* If some data could not be copied, this function will pad the copied
|
|
* data to the requested size using zero bytes.
|
|
*/
|
|
static inline unsigned long __must_check
|
|
copy_from_user(void *to, const void __user *from, unsigned long n)
|
|
{
|
|
unsigned int sz = __compiletime_object_size(to);
|
|
|
|
might_fault();
|
|
if (unlikely(sz != -1 && sz < n)) {
|
|
copy_from_user_overflow();
|
|
return n;
|
|
}
|
|
return __copy_from_user(to, from, n);
|
|
}
|
|
|
|
unsigned long __must_check
|
|
__copy_in_user(void __user *to, const void __user *from, unsigned long n);
|
|
|
|
static inline unsigned long __must_check
|
|
copy_in_user(void __user *to, const void __user *from, unsigned long n)
|
|
{
|
|
might_fault();
|
|
return __copy_in_user(to, from, n);
|
|
}
|
|
|
|
/*
|
|
* Copy a null terminated string from userspace.
|
|
*/
|
|
|
|
long __strncpy_from_user(char *dst, const char __user *src, long count);
|
|
|
|
static inline long __must_check
|
|
strncpy_from_user(char *dst, const char __user *src, long count)
|
|
{
|
|
might_fault();
|
|
return __strncpy_from_user(dst, src, count);
|
|
}
|
|
|
|
/* Out-of-line routine that strnlen_user() forwards to. */
unsigned long __must_check __strnlen_user(const char __user *src, unsigned long count);
|
|
|
|
/*
 * strnlen_user - get the size of a string in user space, with a limit.
 * @src: the string to measure, in user space.
 * @n:   passed through unchanged to __strnlen_user().
 *
 * Calls might_fault() and returns the result of __strnlen_user().
 */
static inline unsigned long strnlen_user(const char __user *src, unsigned long n)
{
	might_fault();
	return __strnlen_user(src, n);
}
|
|
|
|
/**
 * strlen_user: - Get the size of a string in user space.
 * @str: The string to measure.
 *
 * Context: User context only.  This function may sleep.
 *
 * Get the size of a NUL-terminated string in user space.
 *
 * Returns the size of the string INCLUDING the terminating NUL.
 * On exception, returns 0.
 *
 * If there is a limit on the length of a valid string, you may wish to
 * consider using strnlen_user() instead.
 *
 * Implemented as strnlen_user() with the largest possible limit (~0UL).
 */
#define strlen_user(str) strnlen_user(str, ~0UL)
|
|
|
|
/*
|
|
* Zero Userspace
|
|
*/
|
|
unsigned long __must_check __clear_user(void __user *to, unsigned long size);
|
|
|
|
/*
 * clear_user - zero a block of memory in user space.
 * @to: destination address, in user space.
 * @n:  number of bytes to zero.
 *
 * Calls might_fault() and returns the result of __clear_user().
 */
static inline unsigned long __must_check clear_user(void __user *to, unsigned long n)
{
	might_fault();
	return __clear_user(to, n);
}
|
|
|
|
/*
 * NOTE(review): defined elsewhere; the name suggests it copies count bytes
 * from src to the user space address dest using real addressing - confirm
 * semantics and return convention against the implementation.
 */
int copy_to_user_real(void __user *dest, void *src, unsigned long count);
|
|
|
|
#endif /* __S390_UACCESS_H */
|