linux/arch/x86/lib/copy_user_uncached_64.S

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>

/*
 * __copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any 1- or 2-byte pieces at the
 * start or end of the copy are done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 *
 * Registers written: rax, rdx, rsi, rdi, r8-r11 (and flags).
 *
 * Throughout the function %edx holds the number of bytes that
 * have not been copied yet; every exit path returns it in %eax.
 */
SYM_FUNC_START(__copy_user_nocache)
	/* If destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	/* Fewer than 64 bytes left: skip the unrolled loop entirely */
	cmp $64,%edx
	jb .Lquadwords

	.p2align 4,0x90
.Lunrolled:
	/*
	 * One iteration copies 64 bytes as two halves of 32:
	 * four 8-byte loads into r8-r11, then four movnti stores.
	 * The numeric labels are referenced by the exception
	 * table entries below the loop.
	 */
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	/*
	 * Pointers and count are only updated once the whole
	 * 64-byte iteration is done, which is why the fixups
	 * below have to account for partial progress themselves.
	 */
	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * First set of user mode loads have been done
 * without any stores, so if they fail, we can
 * just try the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads have been
 * done with 32 bytes stored to the destination,
 * so we need to take that into account before
 * falling back to the non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were. The number in the .LdoneNN name is
 * the byte count already stored in this iteration.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

/*
 * Non-unrolled loop: copy one quadword at a time
 * with movnti for as long as at least 8 bytes remain.
 */
.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

/*
 * Tail: fewer than 8 bytes remain when we get here from
 * the quadword loop, so bits 2, 1 and 0 of the count say
 * whether a 4-byte, 2-byte and 1-byte piece is left.
 */
.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
	/*
	 * No movnti is issued after this point on this path;
	 * the remaining 2- and 1-byte stores are regular ones.
	 */
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	/* Return the remaining count (32-bit move, so upper rax is zero) */
	mov %edx,%eax
	RET

/*
 * If we fail on the last four bytes, we won't
 * bother with any fixups. It's dead, Jim. Note
 * that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 *
 * Each step first checks that enough bytes remain;
 * if not, it branches into the tail code above to
 * finish (or return) with what is left.
 */
.Lalign:
	test $1,%dil
	je .Lalign_word
	/* Odd destination but nothing to copy: return 0 */
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	/* At most one byte left: let the tail code handle it */
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	/* Fewer than four bytes left: finish with the 2/1-byte tail */
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle
 * of the unrolled loop. The labels fall through into
 * each other, so entering at .LdoneNN subtracts NN
 * bytes (the part of the iteration already stored)
 * from the count before returning it.
 */
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

/*
 * A load in the second half of an unrolled iteration
 * faulted: the first 32 bytes are already stored, so
 * step past them and continue one quadword at a time.
 */
.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

/*
 * The 8-byte load in the quadword loop faulted. Try a
 * single 4-byte uncached copy at the same position; if
 * it works, four more bytes were copied and we return
 * the count minus 4. If it faults too, .Ldone0 returns
 * the count unchanged.
 */
.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
x86: rewrite '__copy_user_nocache' function I didn't really want to do this, but as part of all the other changes to the user copy loops, I've been looking at this horror. I tried to clean it up multiple times, but every time I just found more problems, and the way it's written, it's just too hard to fix them. For example, the code is written to do quad-word alignment, and will use regular byte accesses to get to that point. That's fairly simple, but it means that any initial 8-byte alignment will be done with cached copies. However, the code then is very careful to do any 4-byte _tail_ accesses using an uncached 4-byte write, and that was claimed to be relevant in commit a82eee742452 ("x86/uaccess/64: Handle the caching of 4-byte nocache copies properly in __copy_user_nocache()"). So if you do a 4-byte copy using that function, it carefully uses a 4-byte 'movnti' for the destination. But if you were to do a 12-byte copy that is 4-byte aligned, it would _not_ do a 4-byte 'movnti' followed by a 8-byte 'movnti' to keep it all uncached. Instead, it would align the destination to 8 bytes using a byte-at-a-time loop, and then do a 8-byte 'movnti' for the final 8 bytes. The main caller that cares is __copy_user_flushcache(), which knows about this insanity, and has odd cases for it all. But I just can't deal with looking at this kind of "it does one case right, and another related case entirely wrong". And the code really wasn't fixable without hard drugs, which I try to avoid. So instead, rewrite it in a form that hopefully not only gets this right, but is a bit more maintainable. Knock wood. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2023-04-20 22:13:50 +00:00			`/* SPDX-License-Identifier: GPL-2.0-only */`
			`/*`
			`* Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>`
			`*/`

x86/headers: Replace #include <asm/export.h> with #include <linux/export.h> The following commit: ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost") deprecated <asm/export.h>, which is now a wrapper of <linux/export.h>. Use <linux/export.h> in .S as well as in .c files. After all the <asm/export.h> lines are replaced, <asm/export.h> and <asm-generic/export.h> will be removed. Signed-off-by: Masahiro Yamada <masahiroy@kernel.org> Signed-off-by: Ingo Molnar <mingo@kernel.org> Link: https://lore.kernel.org/r/20230806145958.380314-2-masahiroy@kernel.org 2023-08-06 14:59:56 +00:00			`#include <linux/export.h>`
x86: rewrite '__copy_user_nocache' function I didn't really want to do this, but as part of all the other changes to the user copy loops, I've been looking at this horror. I tried to clean it up multiple times, but every time I just found more problems, and the way it's written, it's just too hard to fix them. For example, the code is written to do quad-word alignment, and will use regular byte accesses to get to that point. That's fairly simple, but it means that any initial 8-byte alignment will be done with cached copies. However, the code then is very careful to do any 4-byte _tail_ accesses using an uncached 4-byte write, and that was claimed to be relevant in commit a82eee742452 ("x86/uaccess/64: Handle the caching of 4-byte nocache copies properly in __copy_user_nocache()"). So if you do a 4-byte copy using that function, it carefully uses a 4-byte 'movnti' for the destination. But if you were to do a 12-byte copy that is 4-byte aligned, it would _not_ do a 4-byte 'movnti' followed by a 8-byte 'movnti' to keep it all uncached. Instead, it would align the destination to 8 bytes using a byte-at-a-time loop, and then do a 8-byte 'movnti' for the final 8 bytes. The main caller that cares is __copy_user_flushcache(), which knows about this insanity, and has odd cases for it all. But I just can't deal with looking at this kind of "it does one case right, and another related case entirely wrong". And the code really wasn't fixable without hard drugs, which I try to avoid. So instead, rewrite it in a form that hopefully not only gets this right, but is a bit more maintainable. Knock wood. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2023-04-20 22:13:50 +00:00			`#include <linux/linkage.h>`
			`#include <asm/asm.h>`

			`/*`
			`* copy_user_nocache - Uncached memory copy with exception handling`
			`*`
			`* This copies from user space into kernel space, but the kernel`
			`* space accesses can take a machine check exception, so they too`
			`* need exception handling.`
			`*`
			`* Note: only 32-bit and 64-bit stores have non-temporal versions,`
			`* and we only use aligned versions. Any unaligned parts at the`
			`* start or end of the copy will be done using normal cached stores.`
			`*`
			`* Input:`
			`* rdi destination`
			`* rsi source`
			`* edx count`
			`*`
			`* Output:`
			`* rax uncopied bytes or 0 if successful.`
			`*/`
			`SYM_FUNC_START(__copy_user_nocache)`
			`/* If destination is not 8-byte aligned, we'll have to align it */`
			`testb $7,%dil`
			`jne .Lalign`

			`.Lis_aligned:`
			`cmp $64,%edx`
			`jb .Lquadwords`

			`.p2align 4,0x90`
			`.Lunrolled:`
			`10: movq (%rsi),%r8`
			`11: movq 8(%rsi),%r9`
			`12: movq 16(%rsi),%r10`
			`13: movq 24(%rsi),%r11`
			`20: movnti %r8,(%rdi)`
			`21: movnti %r9,8(%rdi)`
			`22: movnti %r10,16(%rdi)`
			`23: movnti %r11,24(%rdi)`
			`30: movq 32(%rsi),%r8`
			`31: movq 40(%rsi),%r9`
			`32: movq 48(%rsi),%r10`
			`33: movq 56(%rsi),%r11`
			`40: movnti %r8,32(%rdi)`
			`41: movnti %r9,40(%rdi)`
			`42: movnti %r10,48(%rdi)`
			`43: movnti %r11,56(%rdi)`

			`addq $64,%rsi`
			`addq $64,%rdi`
			`sub $64,%edx`
			`cmp $64,%edx`
			`jae .Lunrolled`

			`/*`
			`* First set of user mode loads have been done`
			`* without any stores, so if they fail, we can`
			`* just try the non-unrolled loop.`
			`*/`
			`_ASM_EXTABLE_UA(10b, .Lquadwords)`
			`_ASM_EXTABLE_UA(11b, .Lquadwords)`
			`_ASM_EXTABLE_UA(12b, .Lquadwords)`
			`_ASM_EXTABLE_UA(13b, .Lquadwords)`

			`/*`
			`* The second set of user mode loads have been`
			`* done with 32 bytes stored to the destination,`
			`* so we need to take that into account before`
			`* falling back to the non-unrolled loop.`
			`*/`
			`_ASM_EXTABLE_UA(30b, .Lfixup32)`
			`_ASM_EXTABLE_UA(31b, .Lfixup32)`
			`_ASM_EXTABLE_UA(32b, .Lfixup32)`
			`_ASM_EXTABLE_UA(33b, .Lfixup32)`

			`/*`
			`* An exception on a write means that we're`
			`* done, but we need to update the count`
			`* depending on where in the unrolled loop`
			`* we were.`
			`*/`
			`_ASM_EXTABLE_UA(20b, .Ldone0)`
			`_ASM_EXTABLE_UA(21b, .Ldone8)`
			`_ASM_EXTABLE_UA(22b, .Ldone16)`
			`_ASM_EXTABLE_UA(23b, .Ldone24)`
			`_ASM_EXTABLE_UA(40b, .Ldone32)`
			`_ASM_EXTABLE_UA(41b, .Ldone40)`
			`_ASM_EXTABLE_UA(42b, .Ldone48)`
			`_ASM_EXTABLE_UA(43b, .Ldone56)`

			`.Lquadwords:`
			`cmp $8,%edx`
			`jb .Llong`
			`50: movq (%rsi),%rax`
			`51: movnti %rax,(%rdi)`
			`addq $8,%rsi`
			`addq $8,%rdi`
			`sub $8,%edx`
			`jmp .Lquadwords`

			`/*`
			`* If we fail on the last full quadword, we will`
			`* not try to do any byte-wise cached accesses.`
			`* We will try to do one more 4-byte uncached`
			`* one, though.`
			`*/`
			`_ASM_EXTABLE_UA(50b, .Llast4)`
			`_ASM_EXTABLE_UA(51b, .Ldone0)`

			`.Llong:`
			`test $4,%dl`
			`je .Lword`
			`60: movl (%rsi),%eax`
			`61: movnti %eax,(%rdi)`
			`addq $4,%rsi`
			`addq $4,%rdi`
			`sub $4,%edx`
			`.Lword:`
			`sfence`
			`test $2,%dl`
			`je .Lbyte`
			`70: movw (%rsi),%ax`
			`71: movw %ax,(%rdi)`
			`addq $2,%rsi`
			`addq $2,%rdi`
			`sub $2,%edx`
			`.Lbyte:`
			`test $1,%dl`
			`je .Ldone`
			`80: movb (%rsi),%al`
			`81: movb %al,(%rdi)`
			`dec %edx`
			`.Ldone:`
			`mov %edx,%eax`
			`RET`

			`/*`
			`* If we fail on the last four bytes, we won't`
			`* bother with any fixups. It's dead, Jim. Note`
			`* that there's no need for 'sfence' for any`
			`* of this, since the exception will have been`
			`* serializing.`
			`*/`
			`_ASM_EXTABLE_UA(60b, .Ldone)`
			`_ASM_EXTABLE_UA(61b, .Ldone)`
			`_ASM_EXTABLE_UA(70b, .Ldone)`
			`_ASM_EXTABLE_UA(71b, .Ldone)`
			`_ASM_EXTABLE_UA(80b, .Ldone)`
			`_ASM_EXTABLE_UA(81b, .Ldone)`

			`/*`
			`* This is the "head needs aligning" case when`
			`* the destination isn't 8-byte aligned. The`
			`* 4-byte case can be done uncached, but any`
			`* smaller alignment is done with regular stores.`
			`*/`
			`.Lalign:`
			`test $1,%dil`
			`je .Lalign_word`
			`test %edx,%edx`
			`je .Ldone`
			`90: movb (%rsi),%al`
			`91: movb %al,(%rdi)`
			`inc %rsi`
			`inc %rdi`
			`dec %edx`
			`.Lalign_word:`
			`test $2,%dil`
			`je .Lalign_long`
			`cmp $2,%edx`
			`jb .Lbyte`
			`92: movw (%rsi),%ax`
			`93: movw %ax,(%rdi)`
			`addq $2,%rsi`
			`addq $2,%rdi`
			`sub $2,%edx`
			`.Lalign_long:`
			`test $4,%dil`
			`je .Lis_aligned`
			`cmp $4,%edx`
			`jb .Lword`
			`94: movl (%rsi),%eax`
			`95: movnti %eax,(%rdi)`
			`addq $4,%rsi`
			`addq $4,%rdi`
			`sub $4,%edx`
			`jmp .Lis_aligned`

			`/*`
			`* If we fail on the initial alignment accesses,`
			`* we're all done. Again, no point in trying to`
			`* do byte-by-byte probing if the 4-byte load`
			`* fails - we're not doing any uncached accesses`
			`* any more.`
			`*/`
			`_ASM_EXTABLE_UA(90b, .Ldone)`
			`_ASM_EXTABLE_UA(91b, .Ldone)`
			`_ASM_EXTABLE_UA(92b, .Ldone)`
			`_ASM_EXTABLE_UA(93b, .Ldone)`
			`_ASM_EXTABLE_UA(94b, .Ldone)`
			`_ASM_EXTABLE_UA(95b, .Ldone)`

			`/*`
			`* Exception table fixups for faults in the middle`
			`*/`
			`.Ldone56: sub $8,%edx`
			`.Ldone48: sub $8,%edx`
			`.Ldone40: sub $8,%edx`
			`.Ldone32: sub $8,%edx`
			`.Ldone24: sub $8,%edx`
			`.Ldone16: sub $8,%edx`
			`.Ldone8: sub $8,%edx`
			`.Ldone0:`
			`mov %edx,%eax`
			`RET`

			`.Lfixup32:`
			`addq $32,%rsi`
			`addq $32,%rdi`
			`sub $32,%edx`
			`jmp .Lquadwords`

			`.Llast4:`
			`52: movl (%rsi),%eax`
			`53: movnti %eax,(%rdi)`
			`sfence`
			`sub $4,%edx`
			`mov %edx,%eax`
			`RET`
			`_ASM_EXTABLE_UA(52b, .Ldone0)`
			`_ASM_EXTABLE_UA(53b, .Ldone0)`

			`SYM_FUNC_END(__copy_user_nocache)`
			`EXPORT_SYMBOL(__copy_user_nocache)`