mirror of https://github.com/torvalds/linux.git
crypto: arm/chacha20 - always use vrev for 16-bit rotates
The 4-way ChaCha20 NEON code implements 16-bit rotates with vrev32.16,
but the one-way code (used on remainder blocks) implements it with
vshl + vsri, which is slower. Switch the one-way code to vrev32.16 too.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent f53ad3e1b3
commit 4e34e51f48
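For context, the rotate being changed is the first step of the ChaCha
quarter round. Below is a minimal C sketch (reference style per RFC 7539,
not the kernel's code; rotl32 and the function name are illustrative).
The rotl32(..., 16) on the first line is the rotate this patch optimizes,
and the NEON comments in the diff mirror these expressions.

#include <stdint.h>

/* Rotate a 32-bit word left by n bits. */
static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter round on four words of the 16-word state. */
static void chacha_quarterround(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}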
@@ -51,9 +51,8 @@ ENTRY(chacha20_block_xor_neon)
 .Ldoubleround:
 	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 	vadd.i32	q0, q0, q1
-	veor		q4, q3, q0
-	vshl.u32	q3, q4, #16
-	vsri.u32	q3, q4, #16
+	veor		q3, q3, q0
+	vrev32.16	q3, q3

 	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
 	vadd.i32	q2, q2, q3
@@ -82,9 +81,8 @@ ENTRY(chacha20_block_xor_neon)

 	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 	vadd.i32	q0, q0, q1
-	veor		q4, q3, q0
-	vshl.u32	q3, q4, #16
-	vsri.u32	q3, q4, #16
+	veor		q3, q3, q0
+	vrev32.16	q3, q3

 	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
 	vadd.i32	q2, q2, q3
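The equivalence the patch relies on: rotating each 32-bit lane left by 16
is exactly a swap of its two 16-bit halves, which NEON can do with a single
vrev32.16 instead of the vshl.u32 + vsri.u32 pair (and without the q4
scratch register). A hedged sketch with NEON intrinsics, using helper names
of my own choosing rather than anything in the kernel tree:

#include <arm_neon.h>

/* Three-operation form used by the one-way code before this commit:
 * shift left into a temporary, then shift-right-and-insert. */
static inline uint32x4_t rotl16_shl_sri(uint32x4_t x)
{
	uint32x4_t t = vshlq_n_u32(x, 16);	/* vshl.u32 */
	return vsriq_n_u32(t, x, 16);		/* vsri.u32 */
}

/* Single-instruction form: reversing the 16-bit halfwords within each
 * 32-bit element (vrev32.16) performs the same rotate. */
static inline uint32x4_t rotl16_vrev(uint32x4_t x)
{
	return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}

Both helpers compute the same result for every input; the vrev form is one
instruction shorter and needs no temporary, which is why the 4-way code
already used it and the one-way code now matches.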