linux/lib/raid6/neon.uc

/* -----------------------------------------------------------------------
 *
 *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
 *
 *   Copyright (C) 2012 Rob Herring
 *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 *   Based on altivec.uc:
 *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * neon$#.c
 *
 * $#-way unrolled NEON intrinsics math RAID-6 instruction set
 *
 * This file is postprocessed using unroll.awk
 */

#include <arm_neon.h>
#include "neon.h"

typedef uint8x16_t unative_t;

#define NSIZE	sizeof(unative_t)

/*
 * SHLBYTE() - shift every byte lane of @v left by one bit.
 *
 * Each of the byte lanes is shifted independently: the bit that falls
 * off the top of a lane is dropped and is *not* carried into the
 * neighbouring lane.
 */
static inline unative_t SHLBYTE(unative_t v)
{
	const unative_t shifted = vshlq_n_u8(v, 1);

	return shifted;
}

/*
 * MASK() - expand the high bit of every byte lane of @v into a byte mask.
 *
 * A lane whose bit 7 is set yields 0xFF, a lane whose bit 7 is clear
 * yields 0x00.  The lanes are viewed as signed bytes so that the right
 * shift by 7 replicates the sign bit across the whole lane.
 */
static inline unative_t MASK(unative_t v)
{
	const int8x16_t as_signed = vreinterpretq_s8_u8(v);
	const int8x16_t sign_fill = vshrq_n_s8(as_signed, 7);

	return vreinterpretq_u8_s8(sign_fill);
}

/*
 * PMUL() - lane-wise polynomial multiplication of @v by @u.
 *
 * Both operands are reinterpreted as vectors of 8-bit polynomials and
 * handed to vmulq_p8(); the 8-bit-per-lane result is reinterpreted back
 * to unative_t.  No bits are changed by the reinterpretations.
 */
static inline unative_t PMUL(unative_t v, unative_t u)
{
	const poly8x16_t lhs = vreinterpretq_p8_u8(v);
	const poly8x16_t rhs = vreinterpretq_p8_u8(u);

	return vreinterpretq_u8_p8(vmulq_p8(lhs, rhs));
}

/*
 * raid6_neon$#_gen_syndrome_real() - compute P and Q from all data blocks.
 *
 * @disks: number of pointers in @ptrs.  Entries 0 .. disks-3 are the data
 *         blocks, entry disks-2 is the P (XOR parity) destination and
 *         entry disks-1 is the Q (RS syndrome) destination.
 * @bytes: number of bytes to process in every block.
 * @ptrs:  array of @disks block pointers.
 *
 * For every chunk of the blocks, P is the XOR of all data blocks.  Q is
 * accumulated from the highest data disk downwards: the running value is
 * multiplied by x in GF(2^8) (shift left by one, then XOR 0x1d into every
 * byte whose high bit was set) and the next data block is XORed in.
 *
 * Each pass of the outer loop consumes NSIZE*$# bytes with full-vector
 * loads and stores and there is no tail handling, so @bytes is presumably
 * always a multiple of that amount -- confirm against the callers.
 * No argument validation is done here: z0 = disks - 3 is used directly as
 * an index into @ptrs.
 *
 * The byte offset is kept in an unsigned long so that it has the same
 * type as @bytes; a plain int would be compared against @bytes with mixed
 * signedness and could not represent offsets above INT_MAX.
 */
void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	unsigned long d;
	int z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	const unative_t x1d = vdupq_n_u8(0x1d);

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		/* Seed both accumulators with the highest data disk */
		wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			/* P: plain XOR of the data */
			wp$$ = veorq_u8(wp$$, wd$$);
			/* Q: multiply the running value by x ... */
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			/* ... then fold in this disk's data */
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}

/*
 * raid6_neon$#_xor_syndrome_real() - fold a range of data blocks into
 * existing P and Q blocks.
 *
 * @disks: number of pointers in @ptrs; only used to locate P at index
 *         disks-2 and Q at index disks-1.
 * @start: lowest data disk index that is read.
 * @stop:  highest data disk index that is read.
 * @bytes: number of bytes to process in every block.
 * @ptrs:  array of block pointers.
 *
 * For every chunk, the data blocks @stop down to @start are combined the
 * same way as in the full syndrome generation (XOR for P; multiply by x in
 * GF(2^8) and XOR for Q).  Disks below @start are not read; the Q value
 * only needs to be multiplied by x once per skipped disk.  That is done
 * four disks at a time with a polynomial multiply (x^4), followed by one
 * fix-up step for the remaining 0..3 disks.  The results are XORed into
 * the current contents of P and Q rather than overwriting them.
 *
 * Each pass of the outer loop consumes NSIZE*$# bytes with full-vector
 * loads and stores and there is no tail handling, so @bytes is presumably
 * always a multiple of that amount -- confirm against the callers.
 *
 * The byte offset is kept in an unsigned long so that it has the same
 * type as @bytes; a plain int would be compared against @bytes with mixed
 * signedness and could not represent offsets above INT_MAX.
 */
void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
				    unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	unsigned long d;
	int z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	const unative_t x1d = vdupq_n_u8(0x1d);

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		/* P starts from its current contents, not from zero */
		wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);

		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			wp$$ = veorq_u8(wp$$, wd$$);
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		/* P/Q left side optimization: multiply by x^4 per 4 disks */
		for ( z = start-1 ; z >= 3 ; z -= 4 ) {
			w2$$ = vshrq_n_u8(wq$$, 4);
			w1$$ = vshlq_n_u8(wq$$, 4);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
		}

		/* Here z is in -1..2, and z + 1 skipped disks remain */
		switch (z) {
		case 2:
			/* three disks left: multiply by x^3 */
			w2$$ = vshrq_n_u8(wq$$, 5);
			w1$$ = vshlq_n_u8(wq$$, 3);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		case 1:
			/* two disks left: multiply by x^2 */
			w2$$ = vshrq_n_u8(wq$$, 6);
			w1$$ = vshlq_n_u8(wq$$, 2);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		case 0:
			/* one disk left: multiply by x */
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		default:
			/* z == -1: no skipped disks remain */
			break;
		}
		/* Fold the delta into the existing Q contents */
		w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
		wq$$ = veorq_u8(wq$$, w1$$);

		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}
lib/raid6: add ARM-NEON accelerated syndrome calculation Rebased/reworked a patch contributed by Rob Herring that uses NEON intrinsics to perform the RAID-6 syndrome calculations. It uses the existing unroll.awk code to generate several unrolled versions of which the best performing one is selected at boot time. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Acked-by: Nicolas Pitre <nico@linaro.org> Cc: hpa@linux.intel.com 2013-05-16 15:20:32 +00:00			`/* -----------------------------------------------------------------------`
			`*`
			`* neon.uc - RAID-6 syndrome calculation using ARM NEON instructions`
			`*`
			`* Copyright (C) 2012 Rob Herring`
md/raid6: delta syndrome for ARM NEON This implements XOR syndrome calculation using NEON intrinsics. As before, the module can be built for ARM and arm64 from the same source. Relative performance on a Cortex-A57 based system: raid6: int64x1 gen() 905 MB/s raid6: int64x1 xor() 881 MB/s raid6: int64x2 gen() 1343 MB/s raid6: int64x2 xor() 1286 MB/s raid6: int64x4 gen() 1896 MB/s raid6: int64x4 xor() 1321 MB/s raid6: int64x8 gen() 1773 MB/s raid6: int64x8 xor() 1165 MB/s raid6: neonx1 gen() 1834 MB/s raid6: neonx1 xor() 1278 MB/s raid6: neonx2 gen() 2528 MB/s raid6: neonx2 xor() 1942 MB/s raid6: neonx4 gen() 2888 MB/s raid6: neonx4 xor() 2334 MB/s raid6: neonx8 gen() 2957 MB/s raid6: neonx8 xor() 2232 MB/s raid6: using algorithm neonx8 gen() 2957 MB/s raid6: .... xor() 2232 MB/s, rmw enabled Cc: Markus Stockhausen <stockhausen@collogia.de> Cc: Neil Brown <neilb@suse.de> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: NeilBrown <neilb@suse.com> 2015-07-01 02:19:56 +00:00			`* Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>`
lib/raid6: add ARM-NEON accelerated syndrome calculation Rebased/reworked a patch contributed by Rob Herring that uses NEON intrinsics to perform the RAID-6 syndrome calculations. It uses the existing unroll.awk code to generate several unrolled versions of which the best performing one is selected at boot time. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Acked-by: Nicolas Pitre <nico@linaro.org> Cc: hpa@linux.intel.com 2013-05-16 15:20:32 +00:00			`*`
			`* Based on altivec.uc:`
			`* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved`
			`*`
			`* This program is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, Inc., 53 Temple Place Ste 330,`
			`* Boston MA 02111-1307, USA; either version 2 of the License, or`
			`* (at your option) any later version; incorporated herein by reference.`
			`*`
			`* ----------------------------------------------------------------------- */`

			`/*`
			`* neon$#.c`
			`*`
			`* $#-way unrolled NEON intrinsics math RAID-6 instruction set`
			`*`
			`* This file is postprocessed using unroll.awk`
			`*/`

			`#include <arm_neon.h>`
raid6: neon: add missing prototypes The raid6 syndrome functions are generated for different sizes and have no generic prototype, while in the inner functions have a prototype in a header that cannot be included from the correct file. In both cases, the compiler warns about missing prototypes: lib/raid6/recov_neon_inner.c:27:6: warning: no previous prototype for '__raid6_2data_recov_neon' [-Wmissing-prototypes] lib/raid6/recov_neon_inner.c:77:6: warning: no previous prototype for '__raid6_datap_recov_neon' [-Wmissing-prototypes] lib/raid6/neon1.c:56:6: warning: no previous prototype for 'raid6_neon1_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon1.c:86:6: warning: no previous prototype for 'raid6_neon1_xor_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon2.c:56:6: warning: no previous prototype for 'raid6_neon2_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon2.c:97:6: warning: no previous prototype for 'raid6_neon2_xor_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon4.c:56:6: warning: no previous prototype for 'raid6_neon4_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon4.c:119:6: warning: no previous prototype for 'raid6_neon4_xor_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon8.c:56:6: warning: no previous prototype for 'raid6_neon8_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon8.c:163:6: warning: no previous prototype for 'raid6_neon8_xor_syndrome_real' [-Wmissing-prototypes] Add a new header file that contains the prototypes for both to avoid the warnings. Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20230517132220.937200-1-arnd@kernel.org 2023-05-17 13:22:12 +00:00			`#include "neon.h"`
lib/raid6: add ARM-NEON accelerated syndrome calculation Rebased/reworked a patch contributed by Rob Herring that uses NEON intrinsics to perform the RAID-6 syndrome calculations. It uses the existing unroll.awk code to generate several unrolled versions of which the best performing one is selected at boot time. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Acked-by: Nicolas Pitre <nico@linaro.org> Cc: hpa@linux.intel.com 2013-05-16 15:20:32 +00:00
			`typedef uint8x16_t unative_t;`

			`#define NSIZE sizeof(unative_t)`

			`/*`
			`* The SHLBYTE() operation shifts each byte left by 1, not`
			`* rolling over into the next byte`
			`*/`
			`static inline unative_t SHLBYTE(unative_t v)`
			`{`
			`return vshlq_n_u8(v, 1);`
			`}`

			`/*`
			`* The MASK() operation returns 0xFF in any byte for which the high`
			`* bit is 1, 0x00 for any byte for which the high bit is 0.`
			`*/`
			`static inline unative_t MASK(unative_t v)`
			`{`
md/raid6: use faster multiplication for ARM NEON delta syndrome The P/Q left side optimization in the delta syndrome simply involves repeatedly multiplying a value by polynomial 'x' in GF(2^8). Given that 'x * x * x * x' equals 'x^4' even in the polynomial world, we can accelerate this substantially by performing up to 4 such operations at once, using the NEON instructions for polynomial multiplication. Results on a Cortex-A57 running in 64-bit mode: Before: ------- raid6: neonx1 xor() 1680 MB/s raid6: neonx2 xor() 2286 MB/s raid6: neonx4 xor() 3162 MB/s raid6: neonx8 xor() 3389 MB/s After: ------ raid6: neonx1 xor() 2281 MB/s raid6: neonx2 xor() 3362 MB/s raid6: neonx4 xor() 3787 MB/s raid6: neonx8 xor() 4239 MB/s While we're at it, simplify MASK() by using a signed shift rather than a vector compare involving a temp register. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 17:16:00 +00:00			`return (unative_t)vshrq_n_s8((int8x16_t)v, 7);`
			`}`

			`static inline unative_t PMUL(unative_t v, unative_t u)`
			`{`
			`return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u);`
lib/raid6: add ARM-NEON accelerated syndrome calculation Rebased/reworked a patch contributed by Rob Herring that uses NEON intrinsics to perform the RAID-6 syndrome calculations. It uses the existing unroll.awk code to generate several unrolled versions of which the best performing one is selected at boot time. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Acked-by: Nicolas Pitre <nico@linaro.org> Cc: hpa@linux.intel.com 2013-05-16 15:20:32 +00:00			`}`

			`void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)`
			`{`
			`uint8_t dptr = (uint8_t )ptrs;`
			`uint8_t p, q;`
			`int d, z, z0;`

			`register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;`
lib/raid6: use vdupq_n_u8 to avoid endianness warnings Clang warns: vector initializers are not compatible with NEON intrinsics in big endian mode [-Wnonportable-vector-initialization] While this is usually the case, it's not an issue for this case since we're initializing the uint8x16_t (16x uint8_t's) with the same value. Instead, use vdupq_n_u8 which both compilers lower into a single movi instruction: https://godbolt.org/z/vBrgzt This avoids the static storage for a constant value. Link: https://github.com/ClangBuiltLinux/linux/issues/214 Suggested-by: Nathan Chancellor <natechancellor@gmail.com> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2019-02-26 04:03:42 +00:00			`const unative_t x1d = vdupq_n_u8(0x1d);`
lib/raid6: add ARM-NEON accelerated syndrome calculation Rebased/reworked a patch contributed by Rob Herring that uses NEON intrinsics to perform the RAID-6 syndrome calculations. It uses the existing unroll.awk code to generate several unrolled versions of which the best performing one is selected at boot time. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Acked-by: Nicolas Pitre <nico@linaro.org> Cc: hpa@linux.intel.com 2013-05-16 15:20:32 +00:00
			`z0 = disks - 3; /* Highest data disk */`
			`p = dptr[z0+1]; /* XOR parity */`
			`q = dptr[z0+2]; /* RS syndrome */`

			`for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {`
			`wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);`
			`for ( z = z0-1 ; z >= 0 ; z-- ) {`
			`wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);`
			`wp$$ = veorq_u8(wp$$, wd$$);`
			`w2$$ = MASK(wq$$);`
			`w1$$ = SHLBYTE(wq$$);`

			`w2$$ = vandq_u8(w2$$, x1d);`
			`w1$$ = veorq_u8(w1$$, w2$$);`
			`wq$$ = veorq_u8(w1$$, wd$$);`
			`}`
			`vst1q_u8(&p[d+NSIZE*$$], wp$$);`
			`vst1q_u8(&q[d+NSIZE*$$], wq$$);`
			`}`
			`}`
md/raid6: delta syndrome for ARM NEON This implements XOR syndrome calculation using NEON intrinsics. As before, the module can be built for ARM and arm64 from the same source. Relative performance on a Cortex-A57 based system: raid6: int64x1 gen() 905 MB/s raid6: int64x1 xor() 881 MB/s raid6: int64x2 gen() 1343 MB/s raid6: int64x2 xor() 1286 MB/s raid6: int64x4 gen() 1896 MB/s raid6: int64x4 xor() 1321 MB/s raid6: int64x8 gen() 1773 MB/s raid6: int64x8 xor() 1165 MB/s raid6: neonx1 gen() 1834 MB/s raid6: neonx1 xor() 1278 MB/s raid6: neonx2 gen() 2528 MB/s raid6: neonx2 xor() 1942 MB/s raid6: neonx4 gen() 2888 MB/s raid6: neonx4 xor() 2334 MB/s raid6: neonx8 gen() 2957 MB/s raid6: neonx8 xor() 2232 MB/s raid6: using algorithm neonx8 gen() 2957 MB/s raid6: .... xor() 2232 MB/s, rmw enabled Cc: Markus Stockhausen <stockhausen@collogia.de> Cc: Neil Brown <neilb@suse.de> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: NeilBrown <neilb@suse.com> 2015-07-01 02:19:56 +00:00
			`void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,`
			`unsigned long bytes, void **ptrs)`
			`{`
			`uint8_t dptr = (uint8_t )ptrs;`
			`uint8_t p, q;`
			`int d, z, z0;`

			`register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;`
lib/raid6: use vdupq_n_u8 to avoid endianness warnings Clang warns: vector initializers are not compatible with NEON intrinsics in big endian mode [-Wnonportable-vector-initialization] While this is usually the case, it's not an issue for this case since we're initializing the uint8x16_t (16x uint8_t's) with the same value. Instead, use vdupq_n_u8 which both compilers lower into a single movi instruction: https://godbolt.org/z/vBrgzt This avoids the static storage for a constant value. Link: https://github.com/ClangBuiltLinux/linux/issues/214 Suggested-by: Nathan Chancellor <natechancellor@gmail.com> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2019-02-26 04:03:42 +00:00			`const unative_t x1d = vdupq_n_u8(0x1d);`
md/raid6: delta syndrome for ARM NEON This implements XOR syndrome calculation using NEON intrinsics. As before, the module can be built for ARM and arm64 from the same source. Relative performance on a Cortex-A57 based system: raid6: int64x1 gen() 905 MB/s raid6: int64x1 xor() 881 MB/s raid6: int64x2 gen() 1343 MB/s raid6: int64x2 xor() 1286 MB/s raid6: int64x4 gen() 1896 MB/s raid6: int64x4 xor() 1321 MB/s raid6: int64x8 gen() 1773 MB/s raid6: int64x8 xor() 1165 MB/s raid6: neonx1 gen() 1834 MB/s raid6: neonx1 xor() 1278 MB/s raid6: neonx2 gen() 2528 MB/s raid6: neonx2 xor() 1942 MB/s raid6: neonx4 gen() 2888 MB/s raid6: neonx4 xor() 2334 MB/s raid6: neonx8 gen() 2957 MB/s raid6: neonx8 xor() 2232 MB/s raid6: using algorithm neonx8 gen() 2957 MB/s raid6: .... xor() 2232 MB/s, rmw enabled Cc: Markus Stockhausen <stockhausen@collogia.de> Cc: Neil Brown <neilb@suse.de> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: NeilBrown <neilb@suse.com> 2015-07-01 02:19:56 +00:00
			`z0 = stop; /* P/Q right side optimization */`
			`p = dptr[disks-2]; /* XOR parity */`
			`q = dptr[disks-1]; /* RS syndrome */`

			`for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {`
			`wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);`
			`wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);`

			`/* P/Q data pages */`
			`for ( z = z0-1 ; z >= start ; z-- ) {`
			`wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);`
			`wp$$ = veorq_u8(wp$$, wd$$);`
			`w2$$ = MASK(wq$$);`
			`w1$$ = SHLBYTE(wq$$);`

			`w2$$ = vandq_u8(w2$$, x1d);`
			`w1$$ = veorq_u8(w1$$, w2$$);`
			`wq$$ = veorq_u8(w1$$, wd$$);`
			`}`
			`/* P/Q left side optimization */`
md/raid6: use faster multiplication for ARM NEON delta syndrome The P/Q left side optimization in the delta syndrome simply involves repeatedly multiplying a value by polynomial 'x' in GF(2^8). Given that 'x * x * x * x' equals 'x^4' even in the polynomial world, we can accelerate this substantially by performing up to 4 such operations at once, using the NEON instructions for polynomial multiplication. Results on a Cortex-A57 running in 64-bit mode: Before: ------- raid6: neonx1 xor() 1680 MB/s raid6: neonx2 xor() 2286 MB/s raid6: neonx4 xor() 3162 MB/s raid6: neonx8 xor() 3389 MB/s After: ------ raid6: neonx1 xor() 2281 MB/s raid6: neonx2 xor() 3362 MB/s raid6: neonx4 xor() 3787 MB/s raid6: neonx8 xor() 4239 MB/s While we're at it, simplify MASK() by using a signed shift rather than a vector compare involving a temp register. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 17:16:00 +00:00			`for ( z = start-1 ; z >= 3 ; z -= 4 ) {`
			`w2$$ = vshrq_n_u8(wq$$, 4);`
			`w1$$ = vshlq_n_u8(wq$$, 4);`

			`w2$$ = PMUL(w2$$, x1d);`
			`wq$$ = veorq_u8(w1$$, w2$$);`
			`}`

			`switch (z) {`
			`case 2:`
			`w2$$ = vshrq_n_u8(wq$$, 5);`
			`w1$$ = vshlq_n_u8(wq$$, 3);`

			`w2$$ = PMUL(w2$$, x1d);`
			`wq$$ = veorq_u8(w1$$, w2$$);`
			`break;`
			`case 1:`
			`w2$$ = vshrq_n_u8(wq$$, 6);`
			`w1$$ = vshlq_n_u8(wq$$, 2);`

			`w2$$ = PMUL(w2$$, x1d);`
			`wq$$ = veorq_u8(w1$$, w2$$);`
			`break;`
			`case 0:`
md/raid6: delta syndrome for ARM NEON This implements XOR syndrome calculation using NEON intrinsics. As before, the module can be built for ARM and arm64 from the same source. Relative performance on a Cortex-A57 based system: raid6: int64x1 gen() 905 MB/s raid6: int64x1 xor() 881 MB/s raid6: int64x2 gen() 1343 MB/s raid6: int64x2 xor() 1286 MB/s raid6: int64x4 gen() 1896 MB/s raid6: int64x4 xor() 1321 MB/s raid6: int64x8 gen() 1773 MB/s raid6: int64x8 xor() 1165 MB/s raid6: neonx1 gen() 1834 MB/s raid6: neonx1 xor() 1278 MB/s raid6: neonx2 gen() 2528 MB/s raid6: neonx2 xor() 1942 MB/s raid6: neonx4 gen() 2888 MB/s raid6: neonx4 xor() 2334 MB/s raid6: neonx8 gen() 2957 MB/s raid6: neonx8 xor() 2232 MB/s raid6: using algorithm neonx8 gen() 2957 MB/s raid6: .... xor() 2232 MB/s, rmw enabled Cc: Markus Stockhausen <stockhausen@collogia.de> Cc: Neil Brown <neilb@suse.de> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: NeilBrown <neilb@suse.com> 2015-07-01 02:19:56 +00:00			`w2$$ = MASK(wq$$);`
			`w1$$ = SHLBYTE(wq$$);`

			`w2$$ = vandq_u8(w2$$, x1d);`
			`wq$$ = veorq_u8(w1$$, w2$$);`
			`}`
			`w1$$ = vld1q_u8(&q[d+NSIZE*$$]);`
			`wq$$ = veorq_u8(wq$$, w1$$);`

			`vst1q_u8(&p[d+NSIZE*$$], wp$$);`
			`vst1q_u8(&q[d+NSIZE*$$], wq$$);`
			`}`
			`}`