lib: unic: Optimize memcpy and memset for aarch64

Move onto using asmdefs macros in assembly files. Implement an
assembly-based memcpy for aarch64 and a C-optimized memset, while
also making place for replacing functions with more optimized ones
in the future.

Signed-off-by: Ivaylo Ivanov <ivo.ivanov.ivanov1@gmail.com>
This commit is contained in:
Ivaylo Ivanov 2024-10-09 18:49:23 +03:00
parent 81d26edaa3
commit 784bfd609a
7 changed files with 461 additions and 56 deletions

View File

@ -344,6 +344,10 @@ all: arch/$(ARCH)/linker.lds uniLoader
main-y := arch/$(ARCH)/start.o \
main/main.o
ifeq ($(ARCH), aarch64)
arch-libs-y := arch/$(ARCH)/memcpy.o
endif
# Object directories
objs-y := main
objs-y += arch
@ -354,8 +358,6 @@ libs-y += board
libs-y += lib
libs-y += drivers
# Include these in the build process as we
uniLoader-dirs := $(objs-y) $(libs-y)
uniLoader-objs := $(patsubst %,%/built-in.o, $(objs-y))
uniLoader-libs := $(patsubst %,%/lib.a, $(libs-y))
@ -367,7 +369,7 @@ uniLoader-all := $(uniLoader-objs) $(uniLoader-libs)
# Do modpost on a prelinked vmlinux. The finally linked vmlinux has
# relevant sections renamed as per the linker script.
quiet_cmd_uniLoader = LD $@.o
cmd_uniLoader = $(LD) $(main-y) $(uniLoader-libs) -o $@.o --script=arch/$(ARCH)/linker.lds
cmd_uniLoader = $(LD) $(main-y) $(arch-libs-y) $(uniLoader-libs) -o $@.o --script=arch/$(ARCH)/linker.lds
arch/$(ARCH)/linker.lds: arch/$(ARCH)/linker.lds.S $(KERNEL_PATH)
$(CPP) $< -DTEXT_BASE=$(TEXT_BASE) -DKERNEL_PATH=$(KERNEL_PATH) -DDTB_PATH=$(DT_PATH) -P -o $@

View File

@ -1 +1,4 @@
obj-y += $(ARCH)/start.o
# libs
obj-y += $(ARCH)/memcpy.o

106
arch/aarch64/asmdefs.h Normal file
View File

@ -0,0 +1,106 @@
/*
* Macros for asm code. AArch64 version.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _ASMDEFS_H
#define _ASMDEFS_H
/* Branch Target Identitication support. */
#define BTI_C hint 34
#define BTI_J hint 36
/* Return address signing support (pac-ret). */
#define PACIASP hint 25; .cfi_window_save
#define AUTIASP hint 29; .cfi_window_save
/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
#define FEATURE_1_AND 0xc0000000
#define FEATURE_1_BTI 1
#define FEATURE_1_PAC 2
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
#ifdef __ILP32__
#define GNU_PROPERTY(type, value) \
.section .note.gnu.property, "a"; \
.p2align 2; \
.word 4; \
.word 12; \
.word 5; \
.asciz "GNU"; \
.word type; \
.word 4; \
.word value; \
.text
#else
#define GNU_PROPERTY(type, value) \
.section .note.gnu.property, "a"; \
.p2align 3; \
.word 4; \
.word 16; \
.word 5; \
.asciz "GNU"; \
.word type; \
.word 4; \
.word value; \
.word 0; \
.text
#endif
/* If set then the GNU Property Note section will be added to
mark objects to support BTI and PAC-RET. */
#ifndef WANT_GNU_PROPERTY
#define WANT_GNU_PROPERTY 1
#endif
#if WANT_GNU_PROPERTY
/* Add property note with supported features to all asm files. */
GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
#endif
#define ENTRY_ALIGN(name, alignment) \
.global name; \
.type name,%function; \
.align alignment; \
name: \
.cfi_startproc; \
BTI_C;
#define ENTRY(name) ENTRY_ALIGN(name, 6)
#define ENTRY_ALIAS(name) \
.global name; \
.type name,%function; \
name:
#define END(name) \
.cfi_endproc; \
.size name, .-name;
#define L(l) .L ## l
#ifdef __ILP32__
/* Sanitize padding bits of pointer arguments as per aapcs64 */
#define PTR_ARG(n) mov w##n, w##n
#else
#define PTR_ARG(n)
#endif
#ifdef __ILP32__
/* Sanitize padding bits of size arguments as per aapcs64 */
#define SIZE_ARG(n) mov w##n, w##n
#else
#define SIZE_ARG(n)
#endif
/* Compiler supports SVE instructions */
#ifndef HAVE_SVE
# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
# define HAVE_SVE 1
# else
# define HAVE_SVE 0
# endif
#endif
#endif

242
arch/aarch64/memcpy.S Normal file
View File

@ -0,0 +1,242 @@
/*
* memcpy - copy memory area
*
* Copyright (c) 2012-2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses.
*
*/
#include "asmdefs.h"
#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend
#define tmp1 x14
/* This implementation handles overlaps and supports both memcpy and memmove
from a single entry point. It uses unaligned accesses and branchless
sequences to keep the code small, simple and improve performance.
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
copies of up to 128 bytes, and large copies. The overhead of the overlap
check is negligible since it is only required for large copies.
Large copies use a software pipelined loop processing 64 bytes per iteration.
The destination pointer is 16-byte aligned to minimize unaligned accesses.
The loop tail is handled by always copying 64 bytes from the end.
*/
ENTRY_ALIAS (memmove)
ENTRY (memcpy)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
b.hi L(copy_long)
cmp count, 32
b.hi L(copy32_128)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend, -16]
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
/* Copy 8-15 bytes. */
L(copy16):
tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
ldr A_lw, [src]
ldr B_lw, [srcend, -4]
str A_lw, [dstin]
str B_lw, [dstend, -4]
ret
/* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb C_lw, [dstend, -1]
L(copy0):
ret
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
ldp D_l, D_h, [srcend, -16]
cmp count, 64
b.hi L(copy128)
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy 65..128 bytes. */
L(copy128):
ldp E_l, E_h, [src, 32]
ldp F_l, F_h, [src, 48]
cmp count, 96
b.ls L(copy96)
ldp G_l, G_h, [srcend, -64]
ldp H_l, H_h, [srcend, -48]
stp G_l, G_h, [dstend, -64]
stp H_l, H_h, [dstend, -48]
L(copy96):
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp E_l, E_h, [dstin, 32]
stp F_l, F_h, [dstin, 48]
stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy more than 128 bytes. */
L(copy_long):
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
cbz tmp1, L(copy0)
cmp tmp1, count
b.lo L(copy_long_backwards)
/* Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [src]
and tmp1, dstin, 15
bic dst, dstin, 15
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
ldp B_l, B_h, [src, 32]
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
L(loop64):
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [src, 32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [src, 48]
stp D_l, D_h, [dst, 64]!
ldp D_l, D_h, [src, 64]!
subs count, count, 64
b.hi L(loop64)
/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [srcend, -16]
stp D_l, D_h, [dst, 64]
stp E_l, E_h, [dstend, -64]
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
ret
.p2align 4
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
ldp D_l, D_h, [srcend, -16]
and tmp1, dstend, 15
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
stp D_l, D_h, [dstend, -16]
ldp B_l, B_h, [srcend, -32]
ldp C_l, C_h, [srcend, -48]
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)
L(loop64_backwards):
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [srcend, -48]
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi L(loop64_backwards)
/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [src, 16]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [src]
stp D_l, D_h, [dstend, -64]
stp G_l, G_h, [dstin, 48]
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
ret
END (memcpy)

View File

@ -2,10 +2,10 @@
/*
* Copyright (c) 2022, Ivaylo Ivanov <ivo.ivanov.ivanov1@gmail.com>
*/
.include "arch/aarch64/macros.h"
#include "asmdefs.h"
.section .text
ENTRY _start
ENTRY(_start)
/* Set up the base address for the stack */
adrp x0, _stack_end /* Load page of the _stack_end symbol */
add x0, x0, #:lo12:_stack_end /* Add the offset within the page */
@ -20,8 +20,8 @@ ENTRY _start
add x1, x1, #:lo12:kernel /* Add offset within the page */
bl main
ENDPROC _start
END(_start)
ENTRY load_kernel
ENTRY(load_kernel)
br x4
ENDPROC load_kernel
END(load_kernel)

View File

@ -6,37 +6,54 @@
#include "stdbool.h"
#include "string.h"
void *memcpy(void *s1, const void *s2, size_t n)
void *memset (void *m, int c, size_t n)
{
char *dest = (char *)s1;
const char *src = (const char *)s2;
char *s = (char *) m;
while (n--) {
*dest++ = *src++;
unsigned int i;
unsigned long buffer;
unsigned long *aligned_addr;
unsigned int d = c & 0xff;
while (UNALIGNED (s)) {
if (n--)
*s++ = (char) c;
else
return m;
}
return s1;
}
if (!TOO_SMALL (n)) {
/* If we get this far, we know that n is large and s is word-aligned. */
aligned_addr = (unsigned long *) s;
void *memmove(void *s1, const void *s2, size_t n)
{
char *dest = (char *)s1;
const char *src = (const char *)s2;
/*
* Store D into each char sized location in BUFFER so that
* we can set large blocks quickly.
*/
buffer = (d << 8) | d;
buffer |= (buffer << 16);
for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
buffer = (buffer << i) | buffer;
if (dest <= src) {
while (n--) {
*dest++ = *src++;
/* Unroll the loop. */
while (n >= LBLOCKSIZE*4) {
*aligned_addr++ = buffer;
*aligned_addr++ = buffer;
*aligned_addr++ = buffer;
*aligned_addr++ = buffer;
n -= 4*LBLOCKSIZE;
}
} else {
src += n;
dest += n;
while (n--) {
*--dest = *--src;
while (n >= LBLOCKSIZE) {
*aligned_addr++ = buffer;
n -= LBLOCKSIZE;
}
/* Pick up the remainder with a bytewise loop. */
s = (char*)aligned_addr;
}
return s1;
return m;
}
int memcmp(const void *s1, const void *s2, size_t n)
@ -56,17 +73,6 @@ int memcmp(const void *s1, const void *s2, size_t n)
return 0;
}
void *memset(void *s, int c, size_t n)
{
unsigned char *p = (unsigned char *)s;
while (n--) {
*p++ = (unsigned char)c;
}
return s;
}
void *memchr(const void *s, int c, size_t n)
{
const unsigned char *p = (const unsigned char *)s;

View File

@ -8,23 +8,69 @@
#include "stddef.h"
void *memcpy(void *s1, const void *s2, size_t n);
void *memmove(void *s1, const void *s2, size_t n);
int memcmp(const void *s1, const void *s2, size_t n);
void *memset(void *s, int c, size_t n);
void *memchr(const void *s, int c, size_t n);
char *strcpy(char *s1, const char *s2);
char *strncpy(char *s1, const char *s2, size_t n);
int strcmp(const char *s1, const char *s2);
int strncmp(const char *s1, const char *s2, size_t n);
size_t strlen(const char *s);
size_t strnlen(const char *s, size_t n);
char *strchr(const char *s, int c);
char *strrchr(const char *s, int c);
long atol(const char *s);
void writel(unsigned int value, void* address);
#define LBLOCKSIZE (sizeof(long))
#define UNALIGNED(X) ((long)X & (LBLOCKSIZE - 1))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
static inline int tolower(int c)
// C-driven unoptimized functions
int memcmp (const void *s1, const void *s2, size_t n);
void *memchr (const void *s, int c, size_t n);
char *strcpy (char *s1, const char *s2);
char *strncpy (char *s1, const char *s2, size_t n);
int strcmp (const char *s1, const char *s2);
int strncmp (const char *s1, const char *s2, size_t n);
size_t strlen (const char *s);
size_t strnlen (const char *s, size_t n);
char *strchr (const char *s, int c);
char *strrchr (const char *s, int c);
long atol (const char *s);
void writel (unsigned int value, void* address);
// C-driven optimized functions
void *memset (void *m, int c, size_t n);
// Assembly-driven functions
#ifdef __aarch64__
void *memcpy (void *__restrict, const void *__restrict, size_t);
void *memmove (void *, const void *, size_t);
#endif
#ifdef __arm__
void *memcpy (void *s1, const void *s2, size_t n)
{
char *dest = (char *)s1;
const char *src = (const char *)s2;
while (n--) {
*dest++ = *src++;
}
return s1;
}
void *memmove (void *s1, const void *s2, size_t n)
{
char *dest = (char *)s1;
const char *src = (const char *)s2;
if (dest <= src) {
while (n--) {
*dest++ = *src++;
}
} else {
src += n;
dest += n;
while (n--) {
*--dest = *--src;
}
}
return s1;
}
#endif
static inline int tolower (int c)
{
if (c >= 'A' && c <= 'Z')
return c - 'A' + 'a';
@ -32,7 +78,7 @@ static inline int tolower(int c)
return c;
}
static inline int toupper(int c)
static inline int toupper (int c)
{
if (c >= 'a' && c <= 'z')
return c - 'a' + 'A';