linux/arch/x86/lib/copy_user_64.S

289 lines
5.8 KiB
ArmAsm
Raw Normal View History

/*
* Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
* Copyright 2002 Andi Kleen, SuSE Labs.
* Subject to the GNU Public License v2.
*
* Functions to copy from and to user space.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
/*
* By placing feature2 after feature1 in altinstructions section, we logically
* implement:
* If CPU has feature2, jmp to alt2 is used
* else if CPU has feature1, jmp to alt1 is used
* else jmp to orig is used.
*/
.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
0:
x86/alternatives: Make JMPs more robust Up until now we had to pay attention to relative JMPs in alternatives about how their relative offset gets computed so that the jump target is still correct. Or, as it is the case for near CALLs (opcode e8), we still have to go and readjust the offset at patching time. What is more, the static_cpu_has_safe() facility had to forcefully generate 5-byte JMPs since we couldn't rely on the compiler to generate properly sized ones so we had to force the longest ones. Worse than that, sometimes it would generate a replacement JMP which is longer than the original one, thus overwriting the beginning of the next instruction at patching time. So, in order to alleviate all that and make using JMPs more straight-forward we go and pad the original instruction in an alternative block with NOPs at build time, should the replacement(s) be longer. This way, alternatives users shouldn't pay special attention so that original and replacement instruction sizes are fine but the assembler would simply add padding where needed and not do anything otherwise. As a second aspect, we go and recompute JMPs at patching time so that we can try to make 5-byte JMPs into two-byte ones if possible. If not, we still have to recompute the offsets as the replacement JMP gets put far away in the .altinstr_replacement section leading to a wrong offset if copied verbatim. For example, on a locally generated kernel image old insn VA: 0xffffffff810014bd, CPU feat: X86_FEATURE_ALWAYS, size: 2 __switch_to: ffffffff810014bd: eb 21 jmp ffffffff810014e0 repl insn: size: 5 ffffffff81d0b23c: e9 b1 62 2f ff jmpq ffffffff810014f2 gets corrected to a 2-byte JMP: apply_alternatives: feat: 3*32+21, old: (ffffffff810014bd, len: 2), repl: (ffffffff81d0b23c, len: 5) alt_insn: e9 b1 62 2f ff recompute_jumps: next_rip: ffffffff81d0b241, tgt_rip: ffffffff810014f2, new_displ: 0x00000033, ret len: 2 converted to: eb 33 90 90 90 and a 5-byte JMP: old insn VA: 0xffffffff81001516, CPU feat: X86_FEATURE_ALWAYS, size: 2 __switch_to: ffffffff81001516: eb 30 jmp ffffffff81001548 repl insn: size: 5 ffffffff81d0b241: e9 10 63 2f ff jmpq ffffffff81001556 gets shortened into a two-byte one: apply_alternatives: feat: 3*32+21, old: (ffffffff81001516, len: 2), repl: (ffffffff81d0b241, len: 5) alt_insn: e9 10 63 2f ff recompute_jumps: next_rip: ffffffff81d0b246, tgt_rip: ffffffff81001556, new_displ: 0x0000003e, ret len: 2 converted to: eb 3e 90 90 90 ... and so on. This leads to a net win of around 40ish replacements * 3 bytes savings =~ 120 bytes of I$ on an AMD guest which means some savings of precious instruction cache bandwidth. The padding to the shorter 2-byte JMPs are single-byte NOPs which on smart microarchitectures means discarding NOPs at decode time and thus freeing up execution bandwidth. Signed-off-by: Borislav Petkov <bp@suse.de>
2015-01-05 12:48:41 +00:00
jmp \orig
1:
.section .altinstr_replacement,"ax"
x86/alternatives: Make JMPs more robust Up until now we had to pay attention to relative JMPs in alternatives about how their relative offset gets computed so that the jump target is still correct. Or, as it is the case for near CALLs (opcode e8), we still have to go and readjust the offset at patching time. What is more, the static_cpu_has_safe() facility had to forcefully generate 5-byte JMPs since we couldn't rely on the compiler to generate properly sized ones so we had to force the longest ones. Worse than that, sometimes it would generate a replacement JMP which is longer than the original one, thus overwriting the beginning of the next instruction at patching time. So, in order to alleviate all that and make using JMPs more straight-forward we go and pad the original instruction in an alternative block with NOPs at build time, should the replacement(s) be longer. This way, alternatives users shouldn't pay special attention so that original and replacement instruction sizes are fine but the assembler would simply add padding where needed and not do anything otherwise. As a second aspect, we go and recompute JMPs at patching time so that we can try to make 5-byte JMPs into two-byte ones if possible. If not, we still have to recompute the offsets as the replacement JMP gets put far away in the .altinstr_replacement section leading to a wrong offset if copied verbatim. For example, on a locally generated kernel image old insn VA: 0xffffffff810014bd, CPU feat: X86_FEATURE_ALWAYS, size: 2 __switch_to: ffffffff810014bd: eb 21 jmp ffffffff810014e0 repl insn: size: 5 ffffffff81d0b23c: e9 b1 62 2f ff jmpq ffffffff810014f2 gets corrected to a 2-byte JMP: apply_alternatives: feat: 3*32+21, old: (ffffffff810014bd, len: 2), repl: (ffffffff81d0b23c, len: 5) alt_insn: e9 b1 62 2f ff recompute_jumps: next_rip: ffffffff81d0b241, tgt_rip: ffffffff810014f2, new_displ: 0x00000033, ret len: 2 converted to: eb 33 90 90 90 and a 5-byte JMP: old insn VA: 0xffffffff81001516, CPU feat: X86_FEATURE_ALWAYS, size: 2 __switch_to: ffffffff81001516: eb 30 jmp ffffffff81001548 repl insn: size: 5 ffffffff81d0b241: e9 10 63 2f ff jmpq ffffffff81001556 gets shortened into a two-byte one: apply_alternatives: feat: 3*32+21, old: (ffffffff81001516, len: 2), repl: (ffffffff81d0b241, len: 5) alt_insn: e9 10 63 2f ff recompute_jumps: next_rip: ffffffff81d0b246, tgt_rip: ffffffff81001556, new_displ: 0x0000003e, ret len: 2 converted to: eb 3e 90 90 90 ... and so on. This leads to a net win of around 40ish replacements * 3 bytes savings =~ 120 bytes of I$ on an AMD guest which means some savings of precious instruction cache bandwidth. The padding to the shorter 2-byte JMPs are single-byte NOPs which on smart microarchitectures means discarding NOPs at decode time and thus freeing up execution bandwidth. Signed-off-by: Borislav Petkov <bp@suse.de>
2015-01-05 12:48:41 +00:00
2:
jmp \alt1
3:
jmp \alt2
.previous
.section .altinstructions,"a"
altinstruction_entry 0b,2b,\feature1,5,5,0
altinstruction_entry 0b,3b,\feature2,5,5,0
.previous
.endm
.macro ALIGN_DESTINATION
/* check for bad alignment of destination */
movl %edi,%ecx
andl $7,%ecx
jz 102f /* already aligned */
subl $8,%ecx
negl %ecx
subl %ecx,%edx
100: movb (%rsi),%al
101: movb %al,(%rdi)
incq %rsi
incq %rdi
decl %ecx
jnz 100b
102:
.section .fixup,"ax"
103: addl %ecx,%edx /* ecx is zerorest also */
jmp copy_user_handle_tail
.previous
_ASM_EXTABLE(100b,103b)
_ASM_EXTABLE(101b,103b)
.endm
/* Standard copy_to_user with segment limit checking */
ENTRY(_copy_to_user)
CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rdi,%rcx
addq %rdx,%rcx
jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx
x86, 64-bit: Fix copy_[to/from]_user() checks for the userspace address limit As reported in BZ #30352: https://bugzilla.kernel.org/show_bug.cgi?id=30352 there's a kernel bug related to reading the last allowed page on x86_64. The _copy_to_user() and _copy_from_user() functions use the following check for address limit: if (buf + size >= limit) fail(); while it should be more permissive: if (buf + size > limit) fail(); That's because the size represents the number of bytes being read/write from/to buf address AND including the buf address. So the copy function will actually never touch the limit address even if "buf + size == limit". Following program fails to use the last page as buffer due to the wrong limit check: #include <sys/mman.h> #include <sys/socket.h> #include <assert.h> #define PAGE_SIZE (4096) #define LAST_PAGE ((void*)(0x7fffffffe000)) int main() { int fds[2], err; void * ptr = mmap(LAST_PAGE, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); assert(ptr == LAST_PAGE); err = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); assert(err == 0); err = send(fds[0], ptr, PAGE_SIZE, 0); perror("send"); assert(err == PAGE_SIZE); err = recv(fds[1], ptr, PAGE_SIZE, MSG_WAITALL); perror("recv"); assert(err == PAGE_SIZE); return 0; } The other place checking the addr limit is the access_ok() function, which is working properly. There's just a misleading comment for the __range_not_ok() macro - which this patch fixes as well. The last page of the user-space address range is a guard page and Brian Gerst observed that the guard page itself due to an erratum on K8 cpus (#121 Sequential Execution Across Non-Canonical Boundary Causes Processor Hang). However, the test code is using the last valid page before the guard page. The bug is that the last byte before the guard page can't be read because of the off-by-one error. The guard page is left in place. This bug would normally not show up because the last page is part of the process stack and never accessed via syscalls. Signed-off-by: Jiri Olsa <jolsa@redhat.com> Acked-by: Brian Gerst <brgerst@gmail.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: <stable@kernel.org> Link: http://lkml.kernel.org/r/1305210630-7136-1-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-12 14:30:30 +00:00
ja bad_to_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
copy_user_generic_unrolled,copy_user_generic_string, \
copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_to_user)
/* Standard copy_from_user with segment limit checking */
ENTRY(_copy_from_user)
CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rsi,%rcx
addq %rdx,%rcx
jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx
x86, 64-bit: Fix copy_[to/from]_user() checks for the userspace address limit As reported in BZ #30352: https://bugzilla.kernel.org/show_bug.cgi?id=30352 there's a kernel bug related to reading the last allowed page on x86_64. The _copy_to_user() and _copy_from_user() functions use the following check for address limit: if (buf + size >= limit) fail(); while it should be more permissive: if (buf + size > limit) fail(); That's because the size represents the number of bytes being read/write from/to buf address AND including the buf address. So the copy function will actually never touch the limit address even if "buf + size == limit". Following program fails to use the last page as buffer due to the wrong limit check: #include <sys/mman.h> #include <sys/socket.h> #include <assert.h> #define PAGE_SIZE (4096) #define LAST_PAGE ((void*)(0x7fffffffe000)) int main() { int fds[2], err; void * ptr = mmap(LAST_PAGE, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); assert(ptr == LAST_PAGE); err = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); assert(err == 0); err = send(fds[0], ptr, PAGE_SIZE, 0); perror("send"); assert(err == PAGE_SIZE); err = recv(fds[1], ptr, PAGE_SIZE, MSG_WAITALL); perror("recv"); assert(err == PAGE_SIZE); return 0; } The other place checking the addr limit is the access_ok() function, which is working properly. There's just a misleading comment for the __range_not_ok() macro - which this patch fixes as well. The last page of the user-space address range is a guard page and Brian Gerst observed that the guard page itself due to an erratum on K8 cpus (#121 Sequential Execution Across Non-Canonical Boundary Causes Processor Hang). However, the test code is using the last valid page before the guard page. The bug is that the last byte before the guard page can't be read because of the off-by-one error. The guard page is left in place. This bug would normally not show up because the last page is part of the process stack and never accessed via syscalls. Signed-off-by: Jiri Olsa <jolsa@redhat.com> Acked-by: Brian Gerst <brgerst@gmail.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: <stable@kernel.org> Link: http://lkml.kernel.org/r/1305210630-7136-1-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-12 14:30:30 +00:00
ja bad_from_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
copy_user_generic_unrolled,copy_user_generic_string, \
copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_from_user)
.section .fixup,"ax"
/* must zero dest */
ENTRY(bad_from_user)
bad_from_user:
CFI_STARTPROC
movl %edx,%ecx
xorl %eax,%eax
rep
stosb
bad_to_user:
movl %edx,%eax
ret
CFI_ENDPROC
ENDPROC(bad_from_user)
.previous
/*
* copy_user_generic_unrolled - memory copy with exception handling.
* This version is for CPUs like P4 that don't have efficient micro
* code for rep movsq
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_generic_unrolled)
CFI_STARTPROC
ASM_STAC
cmpl $8,%edx
jb 20f /* less then 8 bytes, go to byte copy loop */
ALIGN_DESTINATION
movl %edx,%ecx
andl $63,%edx
shrl $6,%ecx
jz 17f
1: movq (%rsi),%r8
2: movq 1*8(%rsi),%r9
3: movq 2*8(%rsi),%r10
4: movq 3*8(%rsi),%r11
5: movq %r8,(%rdi)
6: movq %r9,1*8(%rdi)
7: movq %r10,2*8(%rdi)
8: movq %r11,3*8(%rdi)
9: movq 4*8(%rsi),%r8
10: movq 5*8(%rsi),%r9
11: movq 6*8(%rsi),%r10
12: movq 7*8(%rsi),%r11
13: movq %r8,4*8(%rdi)
14: movq %r9,5*8(%rdi)
15: movq %r10,6*8(%rdi)
16: movq %r11,7*8(%rdi)
leaq 64(%rsi),%rsi
leaq 64(%rdi),%rdi
decl %ecx
jnz 1b
17: movl %edx,%ecx
andl $7,%edx
shrl $3,%ecx
jz 20f
18: movq (%rsi),%r8
19: movq %r8,(%rdi)
leaq 8(%rsi),%rsi
leaq 8(%rdi),%rdi
decl %ecx
jnz 18b
20: andl %edx,%edx
jz 23f
movl %edx,%ecx
21: movb (%rsi),%al
22: movb %al,(%rdi)
incq %rsi
incq %rdi
decl %ecx
jnz 21b
23: xor %eax,%eax
ASM_CLAC
ret
.section .fixup,"ax"
30: shll $6,%ecx
addl %ecx,%edx
jmp 60f
40: leal (%rdx,%rcx,8),%edx
jmp 60f
50: movl %ecx,%edx
60: jmp copy_user_handle_tail /* ecx is zerorest also */
.previous
_ASM_EXTABLE(1b,30b)
_ASM_EXTABLE(2b,30b)
_ASM_EXTABLE(3b,30b)
_ASM_EXTABLE(4b,30b)
_ASM_EXTABLE(5b,30b)
_ASM_EXTABLE(6b,30b)
_ASM_EXTABLE(7b,30b)
_ASM_EXTABLE(8b,30b)
_ASM_EXTABLE(9b,30b)
_ASM_EXTABLE(10b,30b)
_ASM_EXTABLE(11b,30b)
_ASM_EXTABLE(12b,30b)
_ASM_EXTABLE(13b,30b)
_ASM_EXTABLE(14b,30b)
_ASM_EXTABLE(15b,30b)
_ASM_EXTABLE(16b,30b)
_ASM_EXTABLE(18b,40b)
_ASM_EXTABLE(19b,40b)
_ASM_EXTABLE(21b,50b)
_ASM_EXTABLE(22b,50b)
CFI_ENDPROC
ENDPROC(copy_user_generic_unrolled)
/* Some CPUs run faster using the string copy instructions.
* This is also a lot simpler. Use them when possible.
*
* Only 4GB of copy is supported. This shouldn't be a problem
* because the kernel normally only writes from/to page sized chunks
* even if user space passed a longer buffer.
* And more would be dangerous because both Intel and AMD have
* errata with rep movsq > 4GB. If someone feels the need to fix
* this please consider this.
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_generic_string)
CFI_STARTPROC
ASM_STAC
cmpl $8,%edx
jb 2f /* less than 8 bytes, go to byte copy loop */
ALIGN_DESTINATION
movl %edx,%ecx
shrl $3,%ecx
andl $7,%edx
1: rep
movsq
2: movl %edx,%ecx
3: rep
movsb
xorl %eax,%eax
ASM_CLAC
ret
.section .fixup,"ax"
11: leal (%rdx,%rcx,8),%ecx
12: movl %ecx,%edx /* ecx is zerorest also */
jmp copy_user_handle_tail
.previous
_ASM_EXTABLE(1b,11b)
_ASM_EXTABLE(3b,12b)
CFI_ENDPROC
ENDPROC(copy_user_generic_string)
/*
* Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
* It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_enhanced_fast_string)
CFI_STARTPROC
ASM_STAC
movl %edx,%ecx
1: rep
movsb
xorl %eax,%eax
ASM_CLAC
ret
.section .fixup,"ax"
12: movl %ecx,%edx /* ecx is zerorest also */
jmp copy_user_handle_tail
.previous
_ASM_EXTABLE(1b,12b)
CFI_ENDPROC
ENDPROC(copy_user_enhanced_fast_string)