[CRYPTO] salsa20: Add x86-64 assembly version

This is the x86-64 version of the Salsa20 stream cipher algorithm. The
original assembly code came from
<http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>. It has been
reformatted for clarity.

Signed-off-by: Tan Swee Heng <thesweeheng@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Tan Swee Heng 2007-12-18 00:04:40 +08:00 committed by Herbert Xu
parent 974e4b752e
commit 9a7dafbba4
4 changed files with 939 additions and 0 deletions

View File

@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_32.o
@ -15,3 +16,4 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o

View File

@ -0,0 +1,920 @@
# enter ECRYPT_encrypt_bytes
.text
.p2align 5
.globl ECRYPT_encrypt_bytes
ECRYPT_encrypt_bytes:
mov %rsp,%r11
and $31,%r11
add $256,%r11
sub %r11,%rsp
# x = arg1
mov %rdi,%r8
# m = arg2
mov %rsi,%rsi
# out = arg3
mov %rdx,%rdi
# bytes = arg4
mov %rcx,%rdx
# unsigned>? bytes - 0
cmp $0,%rdx
# comment:fp stack unchanged by jump
# goto done if !unsigned>
jbe ._done
# comment:fp stack unchanged by fallthrough
# start:
._start:
# r11_stack = r11
movq %r11,0(%rsp)
# r12_stack = r12
movq %r12,8(%rsp)
# r13_stack = r13
movq %r13,16(%rsp)
# r14_stack = r14
movq %r14,24(%rsp)
# r15_stack = r15
movq %r15,32(%rsp)
# rbx_stack = rbx
movq %rbx,40(%rsp)
# rbp_stack = rbp
movq %rbp,48(%rsp)
# in0 = *(uint64 *) (x + 0)
movq 0(%r8),%rcx
# in2 = *(uint64 *) (x + 8)
movq 8(%r8),%r9
# in4 = *(uint64 *) (x + 16)
movq 16(%r8),%rax
# in6 = *(uint64 *) (x + 24)
movq 24(%r8),%r10
# in8 = *(uint64 *) (x + 32)
movq 32(%r8),%r11
# in10 = *(uint64 *) (x + 40)
movq 40(%r8),%r12
# in12 = *(uint64 *) (x + 48)
movq 48(%r8),%r13
# in14 = *(uint64 *) (x + 56)
movq 56(%r8),%r14
# j0 = in0
movq %rcx,56(%rsp)
# j2 = in2
movq %r9,64(%rsp)
# j4 = in4
movq %rax,72(%rsp)
# j6 = in6
movq %r10,80(%rsp)
# j8 = in8
movq %r11,88(%rsp)
# j10 = in10
movq %r12,96(%rsp)
# j12 = in12
movq %r13,104(%rsp)
# j14 = in14
movq %r14,112(%rsp)
# x_backup = x
movq %r8,120(%rsp)
# bytesatleast1:
._bytesatleast1:
# unsigned<? bytes - 64
cmp $64,%rdx
# comment:fp stack unchanged by jump
# goto nocopy if !unsigned<
jae ._nocopy
# ctarget = out
movq %rdi,128(%rsp)
# out = &tmp
leaq 192(%rsp),%rdi
# i = bytes
mov %rdx,%rcx
# while (i) { *out++ = *m++; --i }
rep movsb
# out = &tmp
leaq 192(%rsp),%rdi
# m = &tmp
leaq 192(%rsp),%rsi
# comment:fp stack unchanged by fallthrough
# nocopy:
._nocopy:
# out_backup = out
movq %rdi,136(%rsp)
# m_backup = m
movq %rsi,144(%rsp)
# bytes_backup = bytes
movq %rdx,152(%rsp)
# x1 = j0
movq 56(%rsp),%rdi
# x0 = x1
mov %rdi,%rdx
# (uint64) x1 >>= 32
shr $32,%rdi
# x3 = j2
movq 64(%rsp),%rsi
# x2 = x3
mov %rsi,%rcx
# (uint64) x3 >>= 32
shr $32,%rsi
# x5 = j4
movq 72(%rsp),%r8
# x4 = x5
mov %r8,%r9
# (uint64) x5 >>= 32
shr $32,%r8
# x5_stack = x5
movq %r8,160(%rsp)
# x7 = j6
movq 80(%rsp),%r8
# x6 = x7
mov %r8,%rax
# (uint64) x7 >>= 32
shr $32,%r8
# x9 = j8
movq 88(%rsp),%r10
# x8 = x9
mov %r10,%r11
# (uint64) x9 >>= 32
shr $32,%r10
# x11 = j10
movq 96(%rsp),%r12
# x10 = x11
mov %r12,%r13
# x10_stack = x10
movq %r13,168(%rsp)
# (uint64) x11 >>= 32
shr $32,%r12
# x13 = j12
movq 104(%rsp),%r13
# x12 = x13
mov %r13,%r14
# (uint64) x13 >>= 32
shr $32,%r13
# x15 = j14
movq 112(%rsp),%r15
# x14 = x15
mov %r15,%rbx
# (uint64) x15 >>= 32
shr $32,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# i = 20
mov $20,%r15
# mainloop:
._mainloop:
# i_backup = i
movq %r15,184(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x12 + x0
lea (%r14,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x4 ^= a
xor %rbp,%r9
# b = x1 + x5
lea (%rdi,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x9 ^= b
xor %rbp,%r10
# a = x0 + x4
lea (%rdx,%r9),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x8 ^= a
xor %rbp,%r11
# b = x5 + x9
lea (%r15,%r10),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x13 ^= b
xor %rbp,%r13
# a = x4 + x8
lea (%r9,%r11),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x12 ^= a
xor %rbp,%r14
# b = x9 + x13
lea (%r10,%r13),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x1 ^= b
xor %rbp,%rdi
# a = x8 + x12
lea (%r11,%r14),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x13 + x1
lea (%r13,%rdi),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x6 + x10
lea (%rax,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x14 ^= c
xor %r15,%rbx
# c = x10 + x14
lea (%rbp,%rbx),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x2 ^= c
xor %r15,%rcx
# c = x14 + x2
lea (%rbx,%rcx),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x6 ^= c
xor %r15,%rax
# c = x2 + x6
lea (%rcx,%rax),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x11 + x15
lea (%r12,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x3 ^= d
xor %rbp,%rsi
# d = x15 + x3
lea (%r15,%rsi),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x7 ^= d
xor %rbp,%r8
# d = x3 + x7
lea (%rsi,%r8),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x11 ^= d
xor %rbp,%r12
# d = x7 + x11
lea (%r8,%r12),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x3 + x0
lea (%rsi,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x1 ^= a
xor %rbp,%rdi
# b = x4 + x5
lea (%r9,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x6 ^= b
xor %rbp,%rax
# a = x0 + x1
lea (%rdx,%rdi),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x2 ^= a
xor %rbp,%rcx
# b = x5 + x6
lea (%r15,%rax),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x7 ^= b
xor %rbp,%r8
# a = x1 + x2
lea (%rdi,%rcx),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x3 ^= a
xor %rbp,%rsi
# b = x6 + x7
lea (%rax,%r8),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x4 ^= b
xor %rbp,%r9
# a = x2 + x3
lea (%rcx,%rsi),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x7 + x4
lea (%r8,%r9),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x9 + x10
lea (%r10,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x11 ^= c
xor %r15,%r12
# c = x10 + x11
lea (%rbp,%r12),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x8 ^= c
xor %r15,%r11
# c = x11 + x8
lea (%r12,%r11),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x9 ^= c
xor %r15,%r10
# c = x8 + x9
lea (%r11,%r10),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x14 + x15
lea (%rbx,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x12 ^= d
xor %rbp,%r14
# d = x15 + x12
lea (%r15,%r14),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x13 ^= d
xor %rbp,%r13
# d = x12 + x13
lea (%r14,%r13),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x14 ^= d
xor %rbp,%rbx
# d = x13 + x14
lea (%r13,%rbx),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x12 + x0
lea (%r14,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x4 ^= a
xor %rbp,%r9
# b = x1 + x5
lea (%rdi,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x9 ^= b
xor %rbp,%r10
# a = x0 + x4
lea (%rdx,%r9),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x8 ^= a
xor %rbp,%r11
# b = x5 + x9
lea (%r15,%r10),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x13 ^= b
xor %rbp,%r13
# a = x4 + x8
lea (%r9,%r11),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x12 ^= a
xor %rbp,%r14
# b = x9 + x13
lea (%r10,%r13),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x1 ^= b
xor %rbp,%rdi
# a = x8 + x12
lea (%r11,%r14),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x13 + x1
lea (%r13,%rdi),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x6 + x10
lea (%rax,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x14 ^= c
xor %r15,%rbx
# c = x10 + x14
lea (%rbp,%rbx),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x2 ^= c
xor %r15,%rcx
# c = x14 + x2
lea (%rbx,%rcx),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x6 ^= c
xor %r15,%rax
# c = x2 + x6
lea (%rcx,%rax),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x11 + x15
lea (%r12,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x3 ^= d
xor %rbp,%rsi
# d = x15 + x3
lea (%r15,%rsi),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x7 ^= d
xor %rbp,%r8
# d = x3 + x7
lea (%rsi,%r8),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x11 ^= d
xor %rbp,%r12
# d = x7 + x11
lea (%r8,%r12),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x3 + x0
lea (%rsi,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x1 ^= a
xor %rbp,%rdi
# b = x4 + x5
lea (%r9,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x6 ^= b
xor %rbp,%rax
# a = x0 + x1
lea (%rdx,%rdi),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x2 ^= a
xor %rbp,%rcx
# b = x5 + x6
lea (%r15,%rax),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x7 ^= b
xor %rbp,%r8
# a = x1 + x2
lea (%rdi,%rcx),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x3 ^= a
xor %rbp,%rsi
# b = x6 + x7
lea (%rax,%r8),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x4 ^= b
xor %rbp,%r9
# a = x2 + x3
lea (%rcx,%rsi),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x7 + x4
lea (%r8,%r9),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x9 + x10
lea (%r10,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x11 ^= c
xor %r15,%r12
# c = x10 + x11
lea (%rbp,%r12),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x8 ^= c
xor %r15,%r11
# c = x11 + x8
lea (%r12,%r11),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x9 ^= c
xor %r15,%r10
# c = x8 + x9
lea (%r11,%r10),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x14 + x15
lea (%rbx,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x12 ^= d
xor %rbp,%r14
# d = x15 + x12
lea (%r15,%r14),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x13 ^= d
xor %rbp,%r13
# d = x12 + x13
lea (%r14,%r13),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x14 ^= d
xor %rbp,%rbx
# d = x13 + x14
lea (%r13,%rbx),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# i = i_backup
movq 184(%rsp),%r15
# unsigned>? i -= 4
sub $4,%r15
# comment:fp stack unchanged by jump
# goto mainloop if unsigned>
ja ._mainloop
# (uint32) x2 += j2
addl 64(%rsp),%ecx
# x3 <<= 32
shl $32,%rsi
# x3 += j2
addq 64(%rsp),%rsi
# (uint64) x3 >>= 32
shr $32,%rsi
# x3 <<= 32
shl $32,%rsi
# x2 += x3
add %rsi,%rcx
# (uint32) x6 += j6
addl 80(%rsp),%eax
# x7 <<= 32
shl $32,%r8
# x7 += j6
addq 80(%rsp),%r8
# (uint64) x7 >>= 32
shr $32,%r8
# x7 <<= 32
shl $32,%r8
# x6 += x7
add %r8,%rax
# (uint32) x8 += j8
addl 88(%rsp),%r11d
# x9 <<= 32
shl $32,%r10
# x9 += j8
addq 88(%rsp),%r10
# (uint64) x9 >>= 32
shr $32,%r10
# x9 <<= 32
shl $32,%r10
# x8 += x9
add %r10,%r11
# (uint32) x12 += j12
addl 104(%rsp),%r14d
# x13 <<= 32
shl $32,%r13
# x13 += j12
addq 104(%rsp),%r13
# (uint64) x13 >>= 32
shr $32,%r13
# x13 <<= 32
shl $32,%r13
# x12 += x13
add %r13,%r14
# (uint32) x0 += j0
addl 56(%rsp),%edx
# x1 <<= 32
shl $32,%rdi
# x1 += j0
addq 56(%rsp),%rdi
# (uint64) x1 >>= 32
shr $32,%rdi
# x1 <<= 32
shl $32,%rdi
# x0 += x1
add %rdi,%rdx
# x5 = x5_stack
movq 160(%rsp),%rdi
# (uint32) x4 += j4
addl 72(%rsp),%r9d
# x5 <<= 32
shl $32,%rdi
# x5 += j4
addq 72(%rsp),%rdi
# (uint64) x5 >>= 32
shr $32,%rdi
# x5 <<= 32
shl $32,%rdi
# x4 += x5
add %rdi,%r9
# x10 = x10_stack
movq 168(%rsp),%r8
# (uint32) x10 += j10
addl 96(%rsp),%r8d
# x11 <<= 32
shl $32,%r12
# x11 += j10
addq 96(%rsp),%r12
# (uint64) x11 >>= 32
shr $32,%r12
# x11 <<= 32
shl $32,%r12
# x10 += x11
add %r12,%r8
# x15 = x15_stack
movq 176(%rsp),%rdi
# (uint32) x14 += j14
addl 112(%rsp),%ebx
# x15 <<= 32
shl $32,%rdi
# x15 += j14
addq 112(%rsp),%rdi
# (uint64) x15 >>= 32
shr $32,%rdi
# x15 <<= 32
shl $32,%rdi
# x14 += x15
add %rdi,%rbx
# out = out_backup
movq 136(%rsp),%rdi
# m = m_backup
movq 144(%rsp),%rsi
# x0 ^= *(uint64 *) (m + 0)
xorq 0(%rsi),%rdx
# *(uint64 *) (out + 0) = x0
movq %rdx,0(%rdi)
# x2 ^= *(uint64 *) (m + 8)
xorq 8(%rsi),%rcx
# *(uint64 *) (out + 8) = x2
movq %rcx,8(%rdi)
# x4 ^= *(uint64 *) (m + 16)
xorq 16(%rsi),%r9
# *(uint64 *) (out + 16) = x4
movq %r9,16(%rdi)
# x6 ^= *(uint64 *) (m + 24)
xorq 24(%rsi),%rax
# *(uint64 *) (out + 24) = x6
movq %rax,24(%rdi)
# x8 ^= *(uint64 *) (m + 32)
xorq 32(%rsi),%r11
# *(uint64 *) (out + 32) = x8
movq %r11,32(%rdi)
# x10 ^= *(uint64 *) (m + 40)
xorq 40(%rsi),%r8
# *(uint64 *) (out + 40) = x10
movq %r8,40(%rdi)
# x12 ^= *(uint64 *) (m + 48)
xorq 48(%rsi),%r14
# *(uint64 *) (out + 48) = x12
movq %r14,48(%rdi)
# x14 ^= *(uint64 *) (m + 56)
xorq 56(%rsi),%rbx
# *(uint64 *) (out + 56) = x14
movq %rbx,56(%rdi)
# bytes = bytes_backup
movq 152(%rsp),%rdx
# in8 = j8
movq 88(%rsp),%rcx
# in8 += 1
add $1,%rcx
# j8 = in8
movq %rcx,88(%rsp)
# unsigned>? unsigned<? bytes - 64
cmp $64,%rdx
# comment:fp stack unchanged by jump
# goto bytesatleast65 if unsigned>
ja ._bytesatleast65
# comment:fp stack unchanged by jump
# goto bytesatleast64 if !unsigned<
jae ._bytesatleast64
# m = out
mov %rdi,%rsi
# out = ctarget
movq 128(%rsp),%rdi
# i = bytes
mov %rdx,%rcx
# while (i) { *out++ = *m++; --i }
rep movsb
# comment:fp stack unchanged by fallthrough
# bytesatleast64:
._bytesatleast64:
# x = x_backup
movq 120(%rsp),%rdi
# in8 = j8
movq 88(%rsp),%rsi
# *(uint64 *) (x + 32) = in8
movq %rsi,32(%rdi)
# r11 = r11_stack
movq 0(%rsp),%r11
# r12 = r12_stack
movq 8(%rsp),%r12
# r13 = r13_stack
movq 16(%rsp),%r13
# r14 = r14_stack
movq 24(%rsp),%r14
# r15 = r15_stack
movq 32(%rsp),%r15
# rbx = rbx_stack
movq 40(%rsp),%rbx
# rbp = rbp_stack
movq 48(%rsp),%rbp
# comment:fp stack unchanged by fallthrough
# done:
._done:
# leave
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret
# bytesatleast65:
._bytesatleast65:
# bytes -= 64
sub $64,%rdx
# out += 64
add $64,%rdi
# m += 64
add $64,%rsi
# comment:fp stack unchanged by jump
# goto bytesatleast1
jmp ._bytesatleast1
# enter ECRYPT_keysetup
.text
.p2align 5
.globl ECRYPT_keysetup
ECRYPT_keysetup:
mov %rsp,%r11
and $31,%r11
add $256,%r11
sub %r11,%rsp
# k = arg2
mov %rsi,%rsi
# kbits = arg3
mov %rdx,%rdx
# x = arg1
mov %rdi,%rdi
# in0 = *(uint64 *) (k + 0)
movq 0(%rsi),%r8
# in2 = *(uint64 *) (k + 8)
movq 8(%rsi),%r9
# *(uint64 *) (x + 4) = in0
movq %r8,4(%rdi)
# *(uint64 *) (x + 12) = in2
movq %r9,12(%rdi)
# unsigned<? kbits - 256
cmp $256,%rdx
# comment:fp stack unchanged by jump
# goto kbits128 if unsigned<
jb ._kbits128
# kbits256:
._kbits256:
# in10 = *(uint64 *) (k + 16)
movq 16(%rsi),%rdx
# in12 = *(uint64 *) (k + 24)
movq 24(%rsi),%rsi
# *(uint64 *) (x + 44) = in10
movq %rdx,44(%rdi)
# *(uint64 *) (x + 52) = in12
movq %rsi,52(%rdi)
# in0 = 1634760805
mov $1634760805,%rsi
# in4 = 857760878
mov $857760878,%rdx
# in10 = 2036477234
mov $2036477234,%rcx
# in14 = 1797285236
mov $1797285236,%r8
# *(uint32 *) (x + 0) = in0
movl %esi,0(%rdi)
# *(uint32 *) (x + 20) = in4
movl %edx,20(%rdi)
# *(uint32 *) (x + 40) = in10
movl %ecx,40(%rdi)
# *(uint32 *) (x + 60) = in14
movl %r8d,60(%rdi)
# comment:fp stack unchanged by jump
# goto keysetupdone
jmp ._keysetupdone
# kbits128:
._kbits128:
# in10 = *(uint64 *) (k + 0)
movq 0(%rsi),%rdx
# in12 = *(uint64 *) (k + 8)
movq 8(%rsi),%rsi
# *(uint64 *) (x + 44) = in10
movq %rdx,44(%rdi)
# *(uint64 *) (x + 52) = in12
movq %rsi,52(%rdi)
# in0 = 1634760805
mov $1634760805,%rsi
# in4 = 824206446
mov $824206446,%rdx
# in10 = 2036477238
mov $2036477238,%rcx
# in14 = 1797285236
mov $1797285236,%r8
# *(uint32 *) (x + 0) = in0
movl %esi,0(%rdi)
# *(uint32 *) (x + 20) = in4
movl %edx,20(%rdi)
# *(uint32 *) (x + 40) = in10
movl %ecx,40(%rdi)
# *(uint32 *) (x + 60) = in14
movl %r8d,60(%rdi)
# keysetupdone:
._keysetupdone:
# leave
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret
# enter ECRYPT_ivsetup
.text
.p2align 5
.globl ECRYPT_ivsetup
ECRYPT_ivsetup:
mov %rsp,%r11
and $31,%r11
add $256,%r11
sub %r11,%rsp
# iv = arg2
mov %rsi,%rsi
# x = arg1
mov %rdi,%rdi
# in6 = *(uint64 *) (iv + 0)
movq 0(%rsi),%rsi
# in8 = 0
mov $0,%r8
# *(uint64 *) (x + 24) = in6
movq %rsi,24(%rdi)
# *(uint64 *) (x + 32) = in8
movq %r8,32(%rdi)
# leave
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret

View File

@ -8,6 +8,8 @@
* and to remove extraneous comments and functions that are not needed.
* - i586 version, renamed as salsa20-i586-asm_32.S
* available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
* - x86-64 version, renamed as salsa20-x86_64-asm_64.S
* available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free

View File

@ -504,6 +504,21 @@ config CRYPTO_SALSA20_586
The Salsa20 stream cipher algorithm is designed by Daniel J.
Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
config CRYPTO_SALSA20_X86_64
tristate "Salsa20 stream cipher algorithm (x86_64) (EXPERIMENTAL)"
depends on (X86 || UML_X86) && 64BIT
depends on EXPERIMENTAL
select CRYPTO_BLKCIPHER
select CRYPTO_SALSA20
help
Salsa20 stream cipher algorithm.
Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
The Salsa20 stream cipher algorithm is designed by Daniel J.
Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
config CRYPTO_DEFLATE
tristate "Deflate compression algorithm"
select CRYPTO_ALGAPI