linux/arch/x86/lib/checksum_32.S
Denys Vlasenko 3e1aa7cb59 x86/asm: Optimize unnecessarily wide TEST instructions
By the nature of the TEST operation, it is often possible to test
a narrower part of the operand:

    "testl $3,  mem"  ->  "testb $3, mem",
    "testq $3, %rcx"  ->  "testb $3, %cl"

This results in shorter instructions, because the TEST instruction
has no sign-entending byte-immediate forms unlike other ALU ops.

Note that this change does not create any LCP (Length-Changing Prefix)
stalls, which happen when adding a 0x66 prefix, which happens when
16-bit immediates are used, which changes such TEST instructions:

  [test_opcode] [modrm] [imm32]

to:

  [0x66] [test_opcode] [modrm] [imm16]

where [imm16] has a *different length* now: 2 bytes instead of 4.
This confuses the decoder and slows down execution.

REX prefixes were carefully designed to almost never hit this case:
adding REX prefix does not change instruction length except MOVABS
and MOV [addr],RAX instruction.

This patch does not add instructions which would use a 0x66 prefix,
code changes in assembly are:

    -48 f7 07 01 00 00 00 	testq  $0x1,(%rdi)
    +f6 07 01             	testb  $0x1,(%rdi)
    -48 f7 c1 01 00 00 00 	test   $0x1,%rcx
    +f6 c1 01             	test   $0x1,%cl
    -48 f7 c1 02 00 00 00 	test   $0x2,%rcx
    +f6 c1 02             	test   $0x2,%cl
    -41 f7 c2 01 00 00 00 	test   $0x1,%r10d
    +41 f6 c2 01          	test   $0x1,%r10b
    -48 f7 c1 04 00 00 00 	test   $0x4,%rcx
    +f6 c1 04             	test   $0x4,%cl
    -48 f7 c1 08 00 00 00 	test   $0x8,%rcx
    +f6 c1 08             	test   $0x8,%cl

Linus further notes:

   "There are no stalls from using 8-bit instruction forms.

    Now, changing from 64-bit or 32-bit 'test' instructions to 8-bit ones
    *could* cause problems if it ends up having forwarding issues, so that
    instead of just forwarding the result, you end up having to wait for
    it to be stable in the L1 cache (or possibly the register file). The
    forwarding from the store buffer is simplest and most reliable if the
    read is done at the exact same address and the exact same size as the
    write that gets forwarded.

    But that's true only if:

     (a) the write was very recent and is still in the write queue. I'm
         not sure that's the case here anyway.

     (b) on at least most Intel microarchitectures, you have to test a
         different byte than the lowest one (so forwarding a 64-bit write
         to a 8-bit read ends up working fine, as long as the 8-bit read
         is of the low 8 bits of the written data).

    A very similar issue *might* show up for registers too, not just
    memory writes, if you use 'testb' with a high-byte register (where
    instead of forwarding the value from the original producer it needs to
    go through the register file and then shifted). But it's mainly a
    problem for store buffers.

    But afaik, the way Denys changed the test instructions, neither of the
    above issues should be true.

    The real problem for store buffer forwarding tends to be "write 8
    bits, read 32 bits". That can be really surprisingly expensive,
    because the read ends up having to wait until the write has hit the
    cacheline, and we might talk tens of cycles of latency here. But
    "write 32 bits, read the low 8 bits" *should* be fast on pretty much
    all x86 chips, afaik."

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1425675332-31576-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-07 11:12:43 +01:00

503 lines
10 KiB
ArmAsm

/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* IP/TCP/UDP checksumming routines
*
* Authors: Jorge Cwik, <jorge@laser.satlink.net>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Tom May, <ftom@netcom.com>
* Pentium Pro/II routines:
* Alexander Kjeldaas <astor@guardian.no>
* Finn Arne Gangstad <finnag@guardian.no>
* Lots of code moved from tcp.c and ip.c; see those files
* for more names.
*
* Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
* handling.
* Andi Kleen, add zeroing on error
* converted to pure assembler
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/errno.h>
#include <asm/asm.h>
/*
* computes a partial checksum, e.g. for TCP/UDP fragments
*/
/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
*/
.text
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
/*
* Experiments with Ethernet and SLIP connections show that buff
* is aligned on either a 2-byte or 4-byte boundary. We get at
* least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
* Fortunately, it is easy to convert 2-byte alignment to 4-byte
* alignment for the unrolled loop.
*/
ENTRY(csum_partial)
CFI_STARTPROC
pushl_cfi_reg esi
pushl_cfi_reg ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: unsigned char *buff
testl $3, %esi # Check alignment.
jz 2f # Jump if alignment is ok.
testl $1, %esi # Check alignment.
jz 10f # Jump if alignment is boundary of 2 bytes.
# buf is odd
dec %ecx
jl 8f
movzbl (%esi), %ebx
adcl %ebx, %eax
roll $8, %eax
inc %esi
testl $2, %esi
jz 2f
10:
subl $2, %ecx # Alignment uses up two bytes.
jae 1f # Jump if we had at least two bytes.
addl $2, %ecx # ecx was < 2. Deal with it.
jmp 4f
1: movw (%esi), %bx
addl $2, %esi
addw %bx, %ax
adcl $0, %eax
2:
movl %ecx, %edx
shrl $5, %ecx
jz 2f
testl %esi, %esi
1: movl (%esi), %ebx
adcl %ebx, %eax
movl 4(%esi), %ebx
adcl %ebx, %eax
movl 8(%esi), %ebx
adcl %ebx, %eax
movl 12(%esi), %ebx
adcl %ebx, %eax
movl 16(%esi), %ebx
adcl %ebx, %eax
movl 20(%esi), %ebx
adcl %ebx, %eax
movl 24(%esi), %ebx
adcl %ebx, %eax
movl 28(%esi), %ebx
adcl %ebx, %eax
lea 32(%esi), %esi
dec %ecx
jne 1b
adcl $0, %eax
2: movl %edx, %ecx
andl $0x1c, %edx
je 4f
shrl $2, %edx # This clears CF
3: adcl (%esi), %eax
lea 4(%esi), %esi
dec %edx
jne 3b
adcl $0, %eax
4: andl $3, %ecx
jz 7f
cmpl $2, %ecx
jb 5f
movw (%esi),%cx
leal 2(%esi),%esi
je 6f
shll $16,%ecx
5: movb (%esi),%cl
6: addl %ecx,%eax
adcl $0, %eax
7:
testb $1, 12(%esp)
jz 8f
roll $8, %eax
8:
popl_cfi_reg ebx
popl_cfi_reg esi
ret
CFI_ENDPROC
ENDPROC(csum_partial)
#else
/* Version for PentiumII/PPro */
ENTRY(csum_partial)
CFI_STARTPROC
pushl_cfi_reg esi
pushl_cfi_reg ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf
testl $3, %esi
jnz 25f
10:
movl %ecx, %edx
movl %ecx, %ebx
andl $0x7c, %ebx
shrl $7, %ecx
addl %ebx,%esi
shrl $2, %ebx
negl %ebx
lea 45f(%ebx,%ebx,2), %ebx
testl %esi, %esi
jmp *%ebx
# Handle 2-byte-aligned regions
20: addw (%esi), %ax
lea 2(%esi), %esi
adcl $0, %eax
jmp 10b
25:
testl $1, %esi
jz 30f
# buf is odd
dec %ecx
jl 90f
movzbl (%esi), %ebx
addl %ebx, %eax
adcl $0, %eax
roll $8, %eax
inc %esi
testl $2, %esi
jz 10b
30: subl $2, %ecx
ja 20b
je 32f
addl $2, %ecx
jz 80f
movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
addl %ebx, %eax
adcl $0, %eax
jmp 80f
32:
addw (%esi), %ax # csumming 2 bytes, 2-aligned
adcl $0, %eax
jmp 80f
40:
addl -128(%esi), %eax
adcl -124(%esi), %eax
adcl -120(%esi), %eax
adcl -116(%esi), %eax
adcl -112(%esi), %eax
adcl -108(%esi), %eax
adcl -104(%esi), %eax
adcl -100(%esi), %eax
adcl -96(%esi), %eax
adcl -92(%esi), %eax
adcl -88(%esi), %eax
adcl -84(%esi), %eax
adcl -80(%esi), %eax
adcl -76(%esi), %eax
adcl -72(%esi), %eax
adcl -68(%esi), %eax
adcl -64(%esi), %eax
adcl -60(%esi), %eax
adcl -56(%esi), %eax
adcl -52(%esi), %eax
adcl -48(%esi), %eax
adcl -44(%esi), %eax
adcl -40(%esi), %eax
adcl -36(%esi), %eax
adcl -32(%esi), %eax
adcl -28(%esi), %eax
adcl -24(%esi), %eax
adcl -20(%esi), %eax
adcl -16(%esi), %eax
adcl -12(%esi), %eax
adcl -8(%esi), %eax
adcl -4(%esi), %eax
45:
lea 128(%esi), %esi
adcl $0, %eax
dec %ecx
jge 40b
movl %edx, %ecx
50: andl $3, %ecx
jz 80f
# Handle the last 1-3 bytes without jumping
notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
movl $0xffffff,%ebx # by the shll and shrl instructions
shll $3,%ecx
shrl %cl,%ebx
andl -128(%esi),%ebx # esi is 4-aligned so should be ok
addl %ebx,%eax
adcl $0,%eax
80:
testb $1, 12(%esp)
jz 90f
roll $8, %eax
90:
popl_cfi_reg ebx
popl_cfi_reg esi
ret
CFI_ENDPROC
ENDPROC(csum_partial)
#endif
/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
int len, int sum, int *src_err_ptr, int *dst_err_ptr)
*/
/*
* Copy from ds while checksumming, otherwise like csum_partial
*
* The macros SRC and DST specify the type of access for the instruction.
* thus we can call a custom exception handler for all access types.
*
* FIXME: could someone double-check whether I haven't mixed up some SRC and
* DST definitions? It's damn hard to trigger all cases. I hope I got
* them all but there's no guarantee.
*/
#define SRC(y...) \
9999: y; \
_ASM_EXTABLE(9999b, 6001f)
#define DST(y...) \
9999: y; \
_ASM_EXTABLE(9999b, 6002f)
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
#define ARGBASE 16
#define FP 12
ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
subl $4,%esp
CFI_ADJUST_CFA_OFFSET 4
pushl_cfi_reg edi
pushl_cfi_reg esi
pushl_cfi_reg ebx
movl ARGBASE+16(%esp),%eax # sum
movl ARGBASE+12(%esp),%ecx # len
movl ARGBASE+4(%esp),%esi # src
movl ARGBASE+8(%esp),%edi # dst
testl $2, %edi # Check alignment.
jz 2f # Jump if alignment is ok.
subl $2, %ecx # Alignment uses up two bytes.
jae 1f # Jump if we had at least two bytes.
addl $2, %ecx # ecx was < 2. Deal with it.
jmp 4f
SRC(1: movw (%esi), %bx )
addl $2, %esi
DST( movw %bx, (%edi) )
addl $2, %edi
addw %bx, %ax
adcl $0, %eax
2:
movl %ecx, FP(%esp)
shrl $5, %ecx
jz 2f
testl %esi, %esi
SRC(1: movl (%esi), %ebx )
SRC( movl 4(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, (%edi) )
adcl %edx, %eax
DST( movl %edx, 4(%edi) )
SRC( movl 8(%esi), %ebx )
SRC( movl 12(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 8(%edi) )
adcl %edx, %eax
DST( movl %edx, 12(%edi) )
SRC( movl 16(%esi), %ebx )
SRC( movl 20(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 16(%edi) )
adcl %edx, %eax
DST( movl %edx, 20(%edi) )
SRC( movl 24(%esi), %ebx )
SRC( movl 28(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 24(%edi) )
adcl %edx, %eax
DST( movl %edx, 28(%edi) )
lea 32(%esi), %esi
lea 32(%edi), %edi
dec %ecx
jne 1b
adcl $0, %eax
2: movl FP(%esp), %edx
movl %edx, %ecx
andl $0x1c, %edx
je 4f
shrl $2, %edx # This clears CF
SRC(3: movl (%esi), %ebx )
adcl %ebx, %eax
DST( movl %ebx, (%edi) )
lea 4(%esi), %esi
lea 4(%edi), %edi
dec %edx
jne 3b
adcl $0, %eax
4: andl $3, %ecx
jz 7f
cmpl $2, %ecx
jb 5f
SRC( movw (%esi), %cx )
leal 2(%esi), %esi
DST( movw %cx, (%edi) )
leal 2(%edi), %edi
je 6f
shll $16,%ecx
SRC(5: movb (%esi), %cl )
DST( movb %cl, (%edi) )
6: addl %ecx, %eax
adcl $0, %eax
7:
5000:
# Exception handler:
.section .fixup, "ax"
6001:
movl ARGBASE+20(%esp), %ebx # src_err_ptr
movl $-EFAULT, (%ebx)
# zero the complete destination - computing the rest
# is too much work
movl ARGBASE+8(%esp), %edi # dst
movl ARGBASE+12(%esp), %ecx # len
xorl %eax,%eax
rep ; stosb
jmp 5000b
6002:
movl ARGBASE+24(%esp), %ebx # dst_err_ptr
movl $-EFAULT,(%ebx)
jmp 5000b
.previous
popl_cfi_reg ebx
popl_cfi_reg esi
popl_cfi_reg edi
popl_cfi %ecx # equivalent to addl $4,%esp
ret
CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
#else
/* Version for PentiumII/PPro */
#define ROUND1(x) \
SRC(movl x(%esi), %ebx ) ; \
addl %ebx, %eax ; \
DST(movl %ebx, x(%edi) ) ;
#define ROUND(x) \
SRC(movl x(%esi), %ebx ) ; \
adcl %ebx, %eax ; \
DST(movl %ebx, x(%edi) ) ;
#define ARGBASE 12
ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
pushl_cfi_reg ebx
pushl_cfi_reg edi
pushl_cfi_reg esi
movl ARGBASE+4(%esp),%esi #src
movl ARGBASE+8(%esp),%edi #dst
movl ARGBASE+12(%esp),%ecx #len
movl ARGBASE+16(%esp),%eax #sum
# movl %ecx, %edx
movl %ecx, %ebx
movl %esi, %edx
shrl $6, %ecx
andl $0x3c, %ebx
negl %ebx
subl %ebx, %esi
subl %ebx, %edi
lea -1(%esi),%edx
andl $-32,%edx
lea 3f(%ebx,%ebx), %ebx
testl %esi, %esi
jmp *%ebx
1: addl $64,%esi
addl $64,%edi
SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
3: adcl $0,%eax
addl $64, %edx
dec %ecx
jge 1b
4: movl ARGBASE+12(%esp),%edx #len
andl $3, %edx
jz 7f
cmpl $2, %edx
jb 5f
SRC( movw (%esi), %dx )
leal 2(%esi), %esi
DST( movw %dx, (%edi) )
leal 2(%edi), %edi
je 6f
shll $16,%edx
5:
SRC( movb (%esi), %dl )
DST( movb %dl, (%edi) )
6: addl %edx, %eax
adcl $0, %eax
7:
.section .fixup, "ax"
6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
movl $-EFAULT, (%ebx)
# zero the complete destination (computing the rest is too much work)
movl ARGBASE+8(%esp),%edi # dst
movl ARGBASE+12(%esp),%ecx # len
xorl %eax,%eax
rep; stosb
jmp 7b
6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
movl $-EFAULT, (%ebx)
jmp 7b
.previous
popl_cfi_reg esi
popl_cfi_reg edi
popl_cfi_reg ebx
ret
CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
#undef ROUND
#undef ROUND1
#endif