From 9e36fa42995a7c7fa8d84a429ca647736c4f1c66 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:10 +0800 Subject: [PATCH 01/24] LoongArch: Clean up the architectural interrupt definitions While interrupts are assigned ECodes `64 + interrupt number`, all existing use sites of interrupt numbers want the 64 subtracted. Re-arrange the definitions so that the actual interrupt number is used everywhere, and make EXCCODE_INT_END inclusive as it is more intuitive that way. While at it, according to the asm/loongarch.h definitions, the total number of architectural interrupts should be 14, but various other places indicate otherwise (13 or 15). Those places have been adjusted to 14 as well for consistency. Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/loongarch.h | 47 ++++++++++++++------------ arch/loongarch/kernel/irq.c | 2 +- arch/loongarch/kernel/perf_event.c | 2 +- arch/loongarch/kernel/time.c | 2 +- arch/loongarch/kernel/traps.c | 2 +- 5 files changed, 29 insertions(+), 26 deletions(-) diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 83da5d29e2d1..7d8b83dd32e9 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -311,8 +311,8 @@ static __always_inline void iocsr_write64(u64 val, u32 reg) #define CSR_ECFG_VS_WIDTH 3 #define CSR_ECFG_VS (_ULCAST_(0x7) << CSR_ECFG_VS_SHIFT) #define CSR_ECFG_IM_SHIFT 0 -#define CSR_ECFG_IM_WIDTH 13 -#define CSR_ECFG_IM (_ULCAST_(0x1fff) << CSR_ECFG_IM_SHIFT) +#define CSR_ECFG_IM_WIDTH 14 +#define CSR_ECFG_IM (_ULCAST_(0x3fff) << CSR_ECFG_IM_SHIFT) #define LOONGARCH_CSR_ESTAT 0x5 /* Exception status */ #define CSR_ESTAT_ESUBCODE_SHIFT 22 @@ -322,8 +322,8 @@ static __always_inline void iocsr_write64(u64 val, u32 reg) #define CSR_ESTAT_EXC_WIDTH 6 #define CSR_ESTAT_EXC (_ULCAST_(0x3f) << CSR_ESTAT_EXC_SHIFT) #define CSR_ESTAT_IS_SHIFT 0 -#define CSR_ESTAT_IS_WIDTH 15 -#define CSR_ESTAT_IS (_ULCAST_(0x7fff) << CSR_ESTAT_IS_SHIFT) +#define CSR_ESTAT_IS_WIDTH 14 +#define CSR_ESTAT_IS (_ULCAST_(0x3fff) << CSR_ESTAT_IS_SHIFT) #define LOONGARCH_CSR_ERA 0x6 /* ERA */ @@ -1090,7 +1090,7 @@ static __always_inline void iocsr_write64(u64 val, u32 reg) #define ECFGF_IPI (_ULCAST_(1) << ECFGB_IPI) #define ECFGF(hwirq) (_ULCAST_(1) << hwirq) -#define ESTATF_IP 0x00001fff +#define ESTATF_IP 0x00003fff #define LOONGARCH_IOCSR_FEATURES 0x8 #define IOCSRF_TEMP BIT_ULL(0) @@ -1418,23 +1418,26 @@ __BUILD_CSR_OP(tlbidx) #define EXCSUBCODE_GCHC 1 /* Hardware caused */ #define EXCCODE_SE 25 /* Security */ -#define EXCCODE_INT_START 64 -#define EXCCODE_SIP0 64 -#define EXCCODE_SIP1 65 -#define EXCCODE_IP0 66 -#define EXCCODE_IP1 67 -#define EXCCODE_IP2 68 -#define EXCCODE_IP3 69 -#define EXCCODE_IP4 70 -#define EXCCODE_IP5 71 -#define EXCCODE_IP6 72 -#define EXCCODE_IP7 73 -#define EXCCODE_PMC 74 /* Performance Counter */ -#define EXCCODE_TIMER 75 -#define EXCCODE_IPI 76 -#define EXCCODE_NMI 77 -#define EXCCODE_INT_END 78 -#define EXCCODE_INT_NUM (EXCCODE_INT_END - EXCCODE_INT_START) +/* Interrupt numbers */ +#define INT_SWI0 0 /* Software Interrupts */ +#define INT_SWI1 1 +#define INT_HWI0 2 /* Hardware Interrupts */ +#define INT_HWI1 3 +#define INT_HWI2 4 +#define INT_HWI3 5 +#define INT_HWI4 6 +#define INT_HWI5 7 +#define INT_HWI6 8 +#define INT_HWI7 9 +#define INT_PCOV 10 /* Performance Counter Overflow */ +#define INT_TI 11 /* Timer */ +#define INT_IPI 12 +#define INT_NMI 13 + +/* ExcCodes corresponding to interrupts */ +#define 
EXCCODE_INT_NUM (INT_NMI + 1) +#define EXCCODE_INT_START 64 +#define EXCCODE_INT_END (EXCCODE_INT_START + EXCCODE_INT_NUM - 1) /* FPU register names */ #define LOONGARCH_FCSR0 $r0 diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c index 0524bf1169b7..883e5066ae44 100644 --- a/arch/loongarch/kernel/irq.c +++ b/arch/loongarch/kernel/irq.c @@ -92,7 +92,7 @@ static int __init get_ipi_irq(void) struct irq_domain *d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY); if (d) - return irq_create_mapping(d, EXCCODE_IPI - EXCCODE_INT_START); + return irq_create_mapping(d, INT_IPI); return -EINVAL; } diff --git a/arch/loongarch/kernel/perf_event.c b/arch/loongarch/kernel/perf_event.c index 707bd32e5c4f..ff28f99b47d7 100644 --- a/arch/loongarch/kernel/perf_event.c +++ b/arch/loongarch/kernel/perf_event.c @@ -461,7 +461,7 @@ static int get_pmc_irq(void) struct irq_domain *d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY); if (d) - return irq_create_mapping(d, EXCCODE_PMC - EXCCODE_INT_START); + return irq_create_mapping(d, INT_PCOV); return -EINVAL; } diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index 4351f69d9950..f377e50f3c66 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -133,7 +133,7 @@ static int get_timer_irq(void) struct irq_domain *d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY); if (d) - return irq_create_mapping(d, EXCCODE_TIMER - EXCCODE_INT_START); + return irq_create_mapping(d, INT_TI); return -EINVAL; } diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index de8ebe20b666..3fb4709c3b19 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -792,7 +792,7 @@ void __init trap_init(void) long i; /* Set interrupt vector handler */ - for (i = EXCCODE_INT_START; i < EXCCODE_INT_END; i++) + for (i = EXCCODE_INT_START; i <= EXCCODE_INT_END; i++) set_handler(i * VECSIZE, handle_vint, VECSIZE); set_handler(EXCCODE_ADE * VECSIZE, handle_ade, VECSIZE); From aa552254cf0039d83908105a07007f9ea616c119 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:10 +0800 Subject: [PATCH 02/24] LoongArch: Define regular names for BCE/WATCH/HVC/GSPR exceptions Define them according to the ISA manual, in order to enable matching the sub-exceptions for humanization purposes later. Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/loongarch.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 7d8b83dd32e9..b3323ab5b78d 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -1397,7 +1397,7 @@ __BUILD_CSR_OP(tlbidx) #define EXSUBCODE_ADEF 0 /* Fetch Instruction */ #define EXSUBCODE_ADEM 1 /* Access Memory*/ #define EXCCODE_ALE 9 /* Unalign Access */ -#define EXCCODE_OOB 10 /* Out of bounds */ +#define EXCCODE_BCE 10 /* Bounds Check Error */ #define EXCCODE_SYS 11 /* System call */ #define EXCCODE_BP 12 /* Breakpoint */ #define EXCCODE_INE 13 /* Inst. Not Exist */ @@ -1408,11 +1408,13 @@ __BUILD_CSR_OP(tlbidx) #define EXCCODE_FPE 18 /* Floating Point Exception */ #define EXCSUBCODE_FPE 0 /* Floating Point Exception */ #define EXCSUBCODE_VFPE 1 /* Vector Exception */ -#define EXCCODE_WATCH 19 /* Watch address reference */ +#define EXCCODE_WATCH 19 /* WatchPoint Exception */ + #define EXCSUBCODE_WPEF 0 /* ... 
on Instruction Fetch */ + #define EXCSUBCODE_WPEM 1 /* ... on Memory Accesses */ #define EXCCODE_BTDIS 20 /* Binary Trans. Disabled */ #define EXCCODE_BTE 21 /* Binary Trans. Exception */ -#define EXCCODE_PSI 22 /* Guest Privileged Error */ -#define EXCCODE_HYP 23 /* Hypercall */ +#define EXCCODE_GSPR 22 /* Guest Privileged Error */ +#define EXCCODE_HVC 23 /* Hypercall */ #define EXCCODE_GCM 24 /* Guest CSR modified */ #define EXCSUBCODE_GCSC 0 /* Software caused */ #define EXCSUBCODE_GCHC 1 /* Hardware caused */ From f6a79b6036ef220028db190c7424ce8099c98090 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:10 +0800 Subject: [PATCH 03/24] LoongArch: Print GPRs with ABI names when showing registers Show PC (CSR.ERA) in place of $zero, and also show the syscall restart flag (conveniently stuffed in regs[0]) if non-zero. Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 3fb4709c3b19..57768a6de1cf 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -158,22 +158,32 @@ static void __show_regs(const struct pt_regs *regs) const int field = 2 * sizeof(unsigned long); unsigned int excsubcode; unsigned int exccode; - int i; show_regs_print_info(KERN_DEFAULT); - /* - * Saved main processor registers - */ - for (i = 0; i < 32; ) { - if ((i % 4) == 0) - printk("$%2d :", i); - pr_cont(" %0*lx", field, regs->regs[i]); + /* Print saved GPRs except $zero (substituting with PC/ERA) */ +#define GPR_FIELD(x) field, regs->regs[x] + printk("pc %0*lx ra %0*lx tp %0*lx sp %0*lx\n", + field, regs->csr_era, GPR_FIELD(1), GPR_FIELD(2), GPR_FIELD(3)); + printk("a0 %0*lx a1 %0*lx a2 %0*lx a3 %0*lx\n", + GPR_FIELD(4), GPR_FIELD(5), GPR_FIELD(6), GPR_FIELD(7)); + printk("a4 %0*lx a5 %0*lx a6 %0*lx a7 %0*lx\n", + GPR_FIELD(8), GPR_FIELD(9), GPR_FIELD(10), GPR_FIELD(11)); + printk("t0 %0*lx t1 %0*lx t2 %0*lx t3 %0*lx\n", + GPR_FIELD(12), GPR_FIELD(13), GPR_FIELD(14), GPR_FIELD(15)); + printk("t4 %0*lx t5 %0*lx t6 %0*lx t7 %0*lx\n", + GPR_FIELD(16), GPR_FIELD(17), GPR_FIELD(18), GPR_FIELD(19)); + printk("t8 %0*lx u0 %0*lx s9 %0*lx s0 %0*lx\n", + GPR_FIELD(20), GPR_FIELD(21), GPR_FIELD(22), GPR_FIELD(23)); + printk("s1 %0*lx s2 %0*lx s3 %0*lx s4 %0*lx\n", + GPR_FIELD(24), GPR_FIELD(25), GPR_FIELD(26), GPR_FIELD(27)); + printk("s5 %0*lx s6 %0*lx s7 %0*lx s8 %0*lx\n", + GPR_FIELD(28), GPR_FIELD(29), GPR_FIELD(30), GPR_FIELD(31)); - i++; - if ((i % 4) == 0) - pr_cont("\n"); - } + /* The slot for $zero is reused as the syscall restart flag */ + if (regs->regs[0]) + printk("syscall restart flag: %0*lx\n", GPR_FIELD(0)); +#undef GPR_FIELD /* * Saved csr registers From 863b3795efae6458dbc24f03add50c575cf4fbd2 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:10 +0800 Subject: [PATCH 04/24] LoongArch: Print symbol info for $ra and CSR.ERA only for kernel-mode contexts Otherwise the addresses wouldn't make sense at all. While at it, align the "map keys" to maintain right-alignment with the "estat:" line too; also swap the ERA and ra lines so all CSRs are shown together. 
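The gist of the change is the user_mode() gating; a minimal sketch of the resulting logic (the full hunk is in the diff below; GPR_FIELD and field are the printing helpers already used by __show_regs()):

    if (user_mode(regs)) {
            /* User addresses: a kernel symbol lookup would be meaningless. */
            printk("  ra: %0*lx\n", GPR_FIELD(1));
            printk(" ERA: %0*lx\n", field, regs->csr_era);
    } else {
            /* Kernel addresses: %pS resolves them to symbol+offset. */
            printk("  ra: %0*lx %pS\n", GPR_FIELD(1), (void *)regs->regs[1]);
            printk(" ERA: %0*lx %pS\n", field, regs->csr_era, (void *)regs->csr_era);
    }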
Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 57768a6de1cf..ab08548315b8 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -183,16 +183,19 @@ static void __show_regs(const struct pt_regs *regs) /* The slot for $zero is reused as the syscall restart flag */ if (regs->regs[0]) printk("syscall restart flag: %0*lx\n", GPR_FIELD(0)); + + if (user_mode(regs)) { + printk(" ra: %0*lx\n", GPR_FIELD(1)); + printk(" ERA: %0*lx\n", field, regs->csr_era); + } else { + printk(" ra: %0*lx %pS\n", GPR_FIELD(1), (void *) regs->regs[1]); + printk(" ERA: %0*lx %pS\n", field, regs->csr_era, (void *) regs->csr_era); + } #undef GPR_FIELD /* * Saved csr registers */ - printk("era : %0*lx %pS\n", field, regs->csr_era, - (void *) regs->csr_era); - printk("ra : %0*lx %pS\n", field, regs->regs[1], - (void *) regs->regs[1]); - printk("CSR crmd: %08lx ", regs->csr_crmd); printk("CSR prmd: %08lx ", regs->csr_prmd); printk("CSR euen: %08lx ", regs->csr_euen); From 05fa8d4977d727cf41294ea09b4a5085cba4ad54 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:10 +0800 Subject: [PATCH 05/24] LoongArch: Fix format of CSR lines during show_regs() Use uppercase CSR names throughout for consistency with the manual wording, and right-align the keys. The "CSR" part is inferrable from context, hence dropped for more horizontal space. Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index ab08548315b8..434a7717face 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -193,16 +193,12 @@ static void __show_regs(const struct pt_regs *regs) } #undef GPR_FIELD - /* - * Saved csr registers - */ - printk("CSR crmd: %08lx ", regs->csr_crmd); - printk("CSR prmd: %08lx ", regs->csr_prmd); - printk("CSR euen: %08lx ", regs->csr_euen); - printk("CSR ecfg: %08lx ", regs->csr_ecfg); - printk("CSR estat: %08lx ", regs->csr_estat); - - pr_cont("\n"); + /* Print saved important CSRs */ + printk(" CRMD: %08lx\n", regs->csr_crmd); + printk(" PRMD: %08lx\n", regs->csr_prmd); + printk(" EUEN: %08lx\n", regs->csr_euen); + printk(" ECFG: %08lx\n", regs->csr_ecfg); + printk("ESTAT: %08lx\n", regs->csr_estat); exccode = ((regs->csr_estat) & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; excsubcode = ((regs->csr_estat) & CSR_ESTAT_ESUBCODE) >> CSR_ESTAT_ESUBCODE_SHIFT; From efada2afacee7b5e59b3182bc36ddfb0fad2b0e2 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:10 +0800 Subject: [PATCH 06/24] LoongArch: Humanize the CRMD line when showing registers Example output looks like: [ xx.xxxxxx] CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) Some initial machinery for this pretty-printing format has been included in this patch as well. 
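For reference, the example CRMD value decodes as follows; an illustrative sketch using the same FIELD_GET() helper (from linux/bitfield.h) that the new code relies on, with the field masks as defined in asm/loongarch.h:

    unsigned long x = 0xb0;                    /* the CRMD value from the example above */

    int plv  = FIELD_GET(CSR_CRMD_PLV, x);     /* bits 1:0 -> 0, printed as "PLV0" */
    int ie   = FIELD_GET(CSR_CRMD_IE, x);      /* bit 2    -> 0, printed as "-IE"  */
    int da   = FIELD_GET(CSR_CRMD_DA, x);      /* bit 3    -> 0, printed as "-DA"  */
    int pg   = FIELD_GET(CSR_CRMD_PG, x);      /* bit 4    -> 1, printed as "+PG"  */
    int dacf = FIELD_GET(CSR_CRMD_DACF, x);    /* bits 6:5 -> 1, memory type "CC"  */
    int dacm = FIELD_GET(CSR_CRMD_DACM, x);    /* bits 8:7 -> 1, memory type "CC"  */
    int we   = FIELD_GET(CSR_CRMD_WE, x);      /* bit 9    -> 0, printed as "-WE"  */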
Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 51 ++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 434a7717face..97af82547725 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -3,6 +3,7 @@ * Author: Huacai Chen * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ +#include #include #include #include @@ -153,6 +154,54 @@ static void show_code(unsigned int *pc, bool user) pr_cont("\n"); } +static void print_bool_fragment(const char *key, unsigned long val, bool first) +{ + /* e.g. "+PG", "-DA" */ + pr_cont("%s%c%s", first ? "" : " ", val ? '+' : '-', key); +} + +static void print_plv_fragment(const char *key, int val) +{ + /* e.g. "PLV0", "PPLV3" */ + pr_cont("%s%d", key, val); +} + +static void print_memory_type_fragment(const char *key, unsigned long val) +{ + const char *humanized_type; + + switch (val) { + case 0: + humanized_type = "SUC"; + break; + case 1: + humanized_type = "CC"; + break; + case 2: + humanized_type = "WUC"; + break; + default: + pr_cont(" %s=Reserved(%lu)", key, val); + return; + } + + /* e.g. " DATM=WUC" */ + pr_cont(" %s=%s", key, humanized_type); +} + +static void print_crmd(unsigned long x) +{ + printk(" CRMD: %08lx (", x); + print_plv_fragment("PLV", (int) FIELD_GET(CSR_CRMD_PLV, x)); + print_bool_fragment("IE", FIELD_GET(CSR_CRMD_IE, x), false); + print_bool_fragment("DA", FIELD_GET(CSR_CRMD_DA, x), false); + print_bool_fragment("PG", FIELD_GET(CSR_CRMD_PG, x), false); + print_memory_type_fragment("DACF", FIELD_GET(CSR_CRMD_DACF, x)); + print_memory_type_fragment("DACM", FIELD_GET(CSR_CRMD_DACM, x)); + print_bool_fragment("WE", FIELD_GET(CSR_CRMD_WE, x), false); + pr_cont(")\n"); +} + static void __show_regs(const struct pt_regs *regs) { const int field = 2 * sizeof(unsigned long); @@ -194,7 +243,7 @@ static void __show_regs(const struct pt_regs *regs) #undef GPR_FIELD /* Print saved important CSRs */ - printk(" CRMD: %08lx\n", regs->csr_crmd); + print_crmd(regs->csr_crmd); printk(" PRMD: %08lx\n", regs->csr_prmd); printk(" EUEN: %08lx\n", regs->csr_euen); printk(" ECFG: %08lx\n", regs->csr_ecfg); From ce7f0b18b030e17fb3d8327f1b6f1719165ca51d Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:10 +0800 Subject: [PATCH 07/24] LoongArch: Humanize the PRMD line when showing registers Example output looks like: [ xx.xxxxxx] PRMD: 00000004 (PPLV0 +PIE -PWE) Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 97af82547725..9f092e1cc782 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -202,6 +202,15 @@ static void print_crmd(unsigned long x) pr_cont(")\n"); } +static void print_prmd(unsigned long x) +{ + printk(" PRMD: %08lx (", x); + print_plv_fragment("PPLV", (int) FIELD_GET(CSR_PRMD_PPLV, x)); + print_bool_fragment("PIE", FIELD_GET(CSR_PRMD_PIE, x), false); + print_bool_fragment("PWE", FIELD_GET(CSR_PRMD_PWE, x), false); + pr_cont(")\n"); +} + static void __show_regs(const struct pt_regs *regs) { const int field = 2 * sizeof(unsigned long); @@ -244,7 +253,7 @@ static void __show_regs(const struct pt_regs *regs) /* Print saved important CSRs */ print_crmd(regs->csr_crmd); - printk(" PRMD: %08lx\n", regs->csr_prmd); + 
print_prmd(regs->csr_prmd); printk(" EUEN: %08lx\n", regs->csr_euen); printk(" ECFG: %08lx\n", regs->csr_ecfg); printk("ESTAT: %08lx\n", regs->csr_estat); From 9718d96c035c594390600168cbca0ae4e0d867f5 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:27 +0800 Subject: [PATCH 08/24] LoongArch: Humanize the EUEN line when showing registers Example output looks like: [ xx.xxxxxx] EUEN: 00000000 (-FPE -SXE -ASXE -BTE) Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 9f092e1cc782..bd6be798f81d 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -211,6 +211,16 @@ static void print_prmd(unsigned long x) pr_cont(")\n"); } +static void print_euen(unsigned long x) +{ + printk(" EUEN: %08lx (", x); + print_bool_fragment("FPE", FIELD_GET(CSR_EUEN_FPEN, x), true); + print_bool_fragment("SXE", FIELD_GET(CSR_EUEN_LSXEN, x), false); + print_bool_fragment("ASXE", FIELD_GET(CSR_EUEN_LASXEN, x), false); + print_bool_fragment("BTE", FIELD_GET(CSR_EUEN_LBTEN, x), false); + pr_cont(")\n"); +} + static void __show_regs(const struct pt_regs *regs) { const int field = 2 * sizeof(unsigned long); @@ -254,7 +264,7 @@ static void __show_regs(const struct pt_regs *regs) /* Print saved important CSRs */ print_crmd(regs->csr_crmd); print_prmd(regs->csr_prmd); - printk(" EUEN: %08lx\n", regs->csr_euen); + print_euen(regs->csr_euen); printk(" ECFG: %08lx\n", regs->csr_ecfg); printk("ESTAT: %08lx\n", regs->csr_estat); From 5e3e784d35c41ca21251f92c243780c791100b02 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:27 +0800 Subject: [PATCH 09/24] LoongArch: Humanize the ECFG line when showing registers Example output looks like: [ xx.xxxxxx] ECFG: 00071c1c (LIE=2-4,10-12 VS=7) Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index bd6be798f81d..6376997f2ae9 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -189,6 +189,12 @@ static void print_memory_type_fragment(const char *key, unsigned long val) pr_cont(" %s=%s", key, humanized_type); } +static void print_intr_fragment(const char *key, unsigned long val) +{ + /* e.g. 
"LIE=0-1,3,5-7" */ + pr_cont("%s=%*pbl", key, EXCCODE_INT_NUM, &val); +} + static void print_crmd(unsigned long x) { printk(" CRMD: %08lx (", x); @@ -221,6 +227,13 @@ static void print_euen(unsigned long x) pr_cont(")\n"); } +static void print_ecfg(unsigned long x) +{ + printk(" ECFG: %08lx (", x); + print_intr_fragment("LIE", FIELD_GET(CSR_ECFG_IM, x)); + pr_cont(" VS=%d)\n", (int) FIELD_GET(CSR_ECFG_VS, x)); +} + static void __show_regs(const struct pt_regs *regs) { const int field = 2 * sizeof(unsigned long); @@ -265,7 +278,7 @@ static void __show_regs(const struct pt_regs *regs) print_crmd(regs->csr_crmd); print_prmd(regs->csr_prmd); print_euen(regs->csr_euen); - printk(" ECFG: %08lx\n", regs->csr_ecfg); + print_ecfg(regs->csr_ecfg); printk("ESTAT: %08lx\n", regs->csr_estat); exccode = ((regs->csr_estat) & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; From 98b90ede59472de5931b7e4d72c3ff948eaa19a4 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:27 +0800 Subject: [PATCH 10/24] LoongArch: Humanize the ESTAT line when showing registers Example output looks like: [ xx.xxxxxx] ESTAT: 00001000 [INT] (IS=12 ECode=0 EsubCode=0) Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 82 ++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 7 deletions(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 6376997f2ae9..748fe2f9aed0 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -234,11 +234,83 @@ static void print_ecfg(unsigned long x) pr_cont(" VS=%d)\n", (int) FIELD_GET(CSR_ECFG_VS, x)); } +static const char *humanize_exc_name(unsigned int ecode, unsigned int esubcode) +{ + /* + * LoongArch users and developers are probably more familiar with + * those names found in the ISA manual, so we are going to print out + * the latter. This will require some mapping. + */ + switch (ecode) { + case EXCCODE_RSV: return "INT"; + case EXCCODE_TLBL: return "PIL"; + case EXCCODE_TLBS: return "PIS"; + case EXCCODE_TLBI: return "PIF"; + case EXCCODE_TLBM: return "PME"; + case EXCCODE_TLBNR: return "PNR"; + case EXCCODE_TLBNX: return "PNX"; + case EXCCODE_TLBPE: return "PPI"; + case EXCCODE_ADE: + switch (esubcode) { + case EXSUBCODE_ADEF: return "ADEF"; + case EXSUBCODE_ADEM: return "ADEM"; + } + break; + case EXCCODE_ALE: return "ALE"; + case EXCCODE_BCE: return "BCE"; + case EXCCODE_SYS: return "SYS"; + case EXCCODE_BP: return "BRK"; + case EXCCODE_INE: return "INE"; + case EXCCODE_IPE: return "IPE"; + case EXCCODE_FPDIS: return "FPD"; + case EXCCODE_LSXDIS: return "SXD"; + case EXCCODE_LASXDIS: return "ASXD"; + case EXCCODE_FPE: + switch (esubcode) { + case EXCSUBCODE_FPE: return "FPE"; + case EXCSUBCODE_VFPE: return "VFPE"; + } + break; + case EXCCODE_WATCH: + switch (esubcode) { + case EXCSUBCODE_WPEF: return "WPEF"; + case EXCSUBCODE_WPEM: return "WPEM"; + } + break; + case EXCCODE_BTDIS: return "BTD"; + case EXCCODE_BTE: return "BTE"; + case EXCCODE_GSPR: return "GSPR"; + case EXCCODE_HVC: return "HVC"; + case EXCCODE_GCM: + switch (esubcode) { + case EXCSUBCODE_GCSC: return "GCSC"; + case EXCSUBCODE_GCHC: return "GCHC"; + } + break; + /* + * The manual did not mention the EXCCODE_SE case, but print out it + * nevertheless. 
+ */ + case EXCCODE_SE: return "SE"; + } + + return "???"; +} + +static void print_estat(unsigned long x) +{ + unsigned int ecode = FIELD_GET(CSR_ESTAT_EXC, x); + unsigned int esubcode = FIELD_GET(CSR_ESTAT_ESUBCODE, x); + + printk("ESTAT: %08lx [%s] (", x, humanize_exc_name(ecode, esubcode)); + print_intr_fragment("IS", FIELD_GET(CSR_ESTAT_IS, x)); + pr_cont(" ECode=%d EsubCode=%d)\n", (int) ecode, (int) esubcode); +} + static void __show_regs(const struct pt_regs *regs) { const int field = 2 * sizeof(unsigned long); - unsigned int excsubcode; - unsigned int exccode; + unsigned int exccode = FIELD_GET(CSR_ESTAT_EXC, regs->csr_estat); show_regs_print_info(KERN_DEFAULT); @@ -279,11 +351,7 @@ static void __show_regs(const struct pt_regs *regs) print_prmd(regs->csr_prmd); print_euen(regs->csr_euen); print_ecfg(regs->csr_ecfg); - printk("ESTAT: %08lx\n", regs->csr_estat); - - exccode = ((regs->csr_estat) & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; - excsubcode = ((regs->csr_estat) & CSR_ESTAT_ESUBCODE) >> CSR_ESTAT_ESUBCODE_SHIFT; - printk("ExcCode : %x (SubCode %x)\n", exccode, excsubcode); + print_estat(regs->csr_estat); if (exccode >= EXCCODE_TLBL && exccode <= EXCCODE_ALE) printk("BadVA : %0*lx\n", field, regs->csr_badvaddr); From 325a38b511caff53c10393bab1e59cbd37f15ec2 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:27 +0800 Subject: [PATCH 11/24] LoongArch: Tweak the BADV and CPUCFG.PRID lines in show_regs() Use ISA manual names for BADV and CPUCFG.PRID lines in show_regs(), for stylistic consistency with the other lines already touched. While at it, also include current CPU's full name in show_regs() output. It may be more helpful for developers looking at the resulting dumps, because multiple distinct CPU models may share the same PRID. Not having this info available may hide problems only found on some but not all of the models sharing one specific PRID. Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 748fe2f9aed0..569c3a5bb052 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -354,10 +354,10 @@ static void __show_regs(const struct pt_regs *regs) print_estat(regs->csr_estat); if (exccode >= EXCCODE_TLBL && exccode <= EXCCODE_ALE) - printk("BadVA : %0*lx\n", field, regs->csr_badvaddr); + printk(" BADV: %0*lx\n", field, regs->csr_badvaddr); - printk("PrId : %08x (%s)\n", read_cpucfg(LOONGARCH_CPUCFG0), - cpu_family_string()); + printk(" PRID: %08x (%s, %s)\n", read_cpucfg(LOONGARCH_CPUCFG0), + cpu_family_string(), cpu_full_name_string()); } void show_regs(struct pt_regs *regs) From c23e7f01cf621290770069d968ca4c8356a50d00 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 1 May 2023 17:19:27 +0800 Subject: [PATCH 12/24] LoongArch: Relay BCE exceptions to userland as SIGSEGV with si_code=SEGV_BNDERR SEGV_BNDERR was introduced initially for supporting the Intel MPX, but fell into disuse after the MPX support was removed. The LoongArch bounds-checking instructions behave very differently than MPX, but overall the interface is still kind of suitable for conveying the information to userland when bounds-checking assertions trigger, so we wouldn't have to invent more UAPI. 
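From the userland side, the report arrives through the standard siginfo fields; a hypothetical handler (sketch only, names made up; the exact field semantics are described below) could consume it like this:

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Illustrative only; fprintf() is not async-signal-safe. */
    static void on_segv(int sig, siginfo_t *si, void *ctx)
    {
            if (si->si_code == SEGV_BNDERR)
                    fprintf(stderr, "BCE: addr=%p lower=%p upper=%p\n",
                            si->si_addr, si->si_lower, si->si_upper);
            _exit(1);
    }

    /* Registered via sigaction() with SA_SIGINFO set. */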
Specifically, when the BCE triggers, a SEGV_BNDERR is sent to userland, with si_addr set to the out-of-bounds address or value (in asrt{gt,le}'s case), and one of si_lower or si_upper set to the configured bound depending on the faulting instruction. The other bound is set to either 0 or ULONG_MAX to resemble a range with both lower and upper bounds. Note that it is possible to have si_addr == si_lower in case of a failing asrtgt or {ld,st}gt, because those instructions test for strict greater-than relationship. This should not pose a problem for userland, though, because the faulting PC is available for the application to associate back to the exact instruction for figuring out the expectation. Example exception context generated by a faulting `asrtgt.d t0, t1` (assert t0 > t1 or BCE) with t0=100 and t1=200: > pc 00005555558206a4 ra 00007ffff2d854fc tp 00007ffff2f2f180 sp 00007ffffbf9fb80 > a0 0000000000000002 a1 00007ffffbf9fce8 a2 00007ffffbf9fd00 a3 00007ffff2ed4558 > a4 0000000000000000 a5 00007ffff2f044c8 a6 00007ffffbf9fce0 a7 fffffffffffff000 > t0 0000000000000064 t1 00000000000000c8 t2 00007ffffbfa2d5e t3 00007ffff2f12aa0 > t4 00007ffff2ed6158 t5 00007ffff2ed6158 t6 000000000000002e t7 0000000003d8f538 > t8 0000000000000005 u0 0000000000000000 s9 0000000000000000 s0 00007ffffbf9fce8 > s1 0000000000000002 s2 0000000000000000 s3 00007ffff2f2c038 s4 0000555555820610 > s5 00007ffff2ed5000 s6 0000555555827e38 s7 00007ffffbf9fd00 s8 0000555555827e38 > ra: 00007ffff2d854fc > ERA: 00005555558206a4 > CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) > PRMD: 00000007 (PPLV3 +PIE -PWE) > EUEN: 00000000 (-FPE -SXE -ASXE -BTE) > ECFG: 0007181c (LIE=2-4,11-12 VS=7) > ESTAT: 000a0000 [BCE] (IS= ECode=10 EsubCode=0) > PRID: 0014c010 (Loongson-64bit, Loongson-3A5000) Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/inst.h | 26 +++++++++ arch/loongarch/kernel/genex.S | 1 + arch/loongarch/kernel/traps.c | 92 +++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+) diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h index a04fe755d719..b09887ffcd15 100644 --- a/arch/loongarch/include/asm/inst.h +++ b/arch/loongarch/include/asm/inst.h @@ -121,6 +121,8 @@ enum reg2bstrd_op { }; enum reg3_op { + asrtle_op = 0x02, + asrtgt_op = 0x03, addw_op = 0x20, addd_op = 0x21, subw_op = 0x22, @@ -176,6 +178,30 @@ enum reg3_op { amord_op = 0x70c7, amxorw_op = 0x70c8, amxord_op = 0x70c9, + fldgts_op = 0x70e8, + fldgtd_op = 0x70e9, + fldles_op = 0x70ea, + fldled_op = 0x70eb, + fstgts_op = 0x70ec, + fstgtd_op = 0x70ed, + fstles_op = 0x70ee, + fstled_op = 0x70ef, + ldgtb_op = 0x70f0, + ldgth_op = 0x70f1, + ldgtw_op = 0x70f2, + ldgtd_op = 0x70f3, + ldleb_op = 0x70f4, + ldleh_op = 0x70f5, + ldlew_op = 0x70f6, + ldled_op = 0x70f7, + stgtb_op = 0x70f8, + stgth_op = 0x70f9, + stgtw_op = 0x70fa, + stgtd_op = 0x70fb, + stleb_op = 0x70fc, + stleh_op = 0x70fd, + stlew_op = 0x70fe, + stled_op = 0x70ff, }; enum reg3sa2_op { diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S index 44ff1ff64260..78f066384657 100644 --- a/arch/loongarch/kernel/genex.S +++ b/arch/loongarch/kernel/genex.S @@ -82,6 +82,7 @@ SYM_FUNC_END(except_vec_cex) BUILD_HANDLER ade ade badv BUILD_HANDLER ale ale badv + BUILD_HANDLER bce bce none BUILD_HANDLER bp bp none BUILD_HANDLER fpe fpe fcsr BUILD_HANDLER fpu fpu none diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 569c3a5bb052..8db26e4ca447 100644 --- 
a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ extern asmlinkage void handle_ade(void); extern asmlinkage void handle_ale(void); +extern asmlinkage void handle_bce(void); extern asmlinkage void handle_sys(void); extern asmlinkage void handle_bp(void); extern asmlinkage void handle_ri(void); @@ -588,6 +590,95 @@ static void bug_handler(struct pt_regs *regs) } } +asmlinkage void noinstr do_bce(struct pt_regs *regs) +{ + bool user = user_mode(regs); + unsigned long era = exception_era(regs); + u64 badv = 0, lower = 0, upper = ULONG_MAX; + union loongarch_instruction insn; + irqentry_state_t state = irqentry_enter(regs); + + if (regs->csr_prmd & CSR_PRMD_PIE) + local_irq_enable(); + + current->thread.trap_nr = read_csr_excode(); + + die_if_kernel("Bounds check error in kernel code", regs); + + /* + * Pull out the address that failed bounds checking, and the lower / + * upper bound, by minimally looking at the faulting instruction word + * and reading from the correct register. + */ + if (__get_inst(&insn.word, (u32 *)era, user)) + goto bad_era; + + switch (insn.reg3_format.opcode) { + case asrtle_op: + if (insn.reg3_format.rd != 0) + break; /* not asrtle */ + badv = regs->regs[insn.reg3_format.rj]; + upper = regs->regs[insn.reg3_format.rk]; + break; + + case asrtgt_op: + if (insn.reg3_format.rd != 0) + break; /* not asrtgt */ + badv = regs->regs[insn.reg3_format.rj]; + lower = regs->regs[insn.reg3_format.rk]; + break; + + case ldleb_op: + case ldleh_op: + case ldlew_op: + case ldled_op: + case stleb_op: + case stleh_op: + case stlew_op: + case stled_op: + case fldles_op: + case fldled_op: + case fstles_op: + case fstled_op: + badv = regs->regs[insn.reg3_format.rj]; + upper = regs->regs[insn.reg3_format.rk]; + break; + + case ldgtb_op: + case ldgth_op: + case ldgtw_op: + case ldgtd_op: + case stgtb_op: + case stgth_op: + case stgtw_op: + case stgtd_op: + case fldgts_op: + case fldgtd_op: + case fstgts_op: + case fstgtd_op: + badv = regs->regs[insn.reg3_format.rj]; + lower = regs->regs[insn.reg3_format.rk]; + break; + } + + force_sig_bnderr((void __user *)badv, (void __user *)lower, (void __user *)upper); + +out: + if (regs->csr_prmd & CSR_PRMD_PIE) + local_irq_disable(); + + irqentry_exit(regs, state); + return; + +bad_era: + /* + * Cannot pull out the instruction word, hence cannot provide more + * info than a regular SIGSEGV in this case. + */ + force_sig(SIGSEGV); + goto out; +} + asmlinkage void noinstr do_bp(struct pt_regs *regs) { bool user = user_mode(regs); @@ -955,6 +1046,7 @@ void __init trap_init(void) set_handler(EXCCODE_ADE * VECSIZE, handle_ade, VECSIZE); set_handler(EXCCODE_ALE * VECSIZE, handle_ale, VECSIZE); + set_handler(EXCCODE_BCE * VECSIZE, handle_bce, VECSIZE); set_handler(EXCCODE_SYS * VECSIZE, handle_sys, VECSIZE); set_handler(EXCCODE_BP * VECSIZE, handle_bp, VECSIZE); set_handler(EXCCODE_INE * VECSIZE, handle_ri, VECSIZE); From 2b3bd32ea3a22ea2d5e591da4ac2c2b1fb17c0e0 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 1 May 2023 17:19:27 +0800 Subject: [PATCH 13/24] LoongArch: Provide kernel fpu functions Provide kernel_fpu_begin()/kernel_fpu_end() to allow the kernel itself to use fpu. They can be used by some other kernel components, e.g., the AMDGPU graphic driver for DCN. 
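Usage mirrors the x86 API of the same name; a rough sketch of a consumer (the function and data below are made up for illustration):

    #include <asm/fpu.h>

    static void scale_samples(float *buf, int n)
    {
            int i;

            kernel_fpu_begin();     /* disables preemption, saves live user FP state */
            for (i = 0; i < n; i++)
                    buf[i] *= 0.5f;
            kernel_fpu_end();       /* restores user FP state, re-enables preemption */
    }

Note that sleeping is not allowed between the two calls, since preemption stays disabled for the whole region.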
Reported-by: WANG Xuerui Tested-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/fpu.h | 3 +++ arch/loongarch/kernel/Makefile | 2 +- arch/loongarch/kernel/kfpu.c | 43 ++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 arch/loongarch/kernel/kfpu.c diff --git a/arch/loongarch/include/asm/fpu.h b/arch/loongarch/include/asm/fpu.h index 358b254d9c1d..192f8e35d912 100644 --- a/arch/loongarch/include/asm/fpu.h +++ b/arch/loongarch/include/asm/fpu.h @@ -21,6 +21,9 @@ struct sigcontext; +extern void kernel_fpu_begin(void); +extern void kernel_fpu_end(void); + extern void _init_fpu(unsigned int); extern void _save_fp(struct loongarch_fpu *); extern void _restore_fp(struct loongarch_fpu *); diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index 78d4e3384305..9a72d91cd104 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -13,7 +13,7 @@ obj-y += head.o cpu-probe.o cacheinfo.o env.o setup.o entry.o genex.o \ obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_EFI) += efi.o -obj-$(CONFIG_CPU_HAS_FPU) += fpu.o +obj-$(CONFIG_CPU_HAS_FPU) += fpu.o kfpu.o obj-$(CONFIG_ARCH_STRICT_ALIGN) += unaligned.o diff --git a/arch/loongarch/kernel/kfpu.c b/arch/loongarch/kernel/kfpu.c new file mode 100644 index 000000000000..5c46ae8c6cac --- /dev/null +++ b/arch/loongarch/kernel/kfpu.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +#include +#include +#include +#include + +static DEFINE_PER_CPU(bool, in_kernel_fpu); + +void kernel_fpu_begin(void) +{ + preempt_disable(); + + WARN_ON(this_cpu_read(in_kernel_fpu)); + + this_cpu_write(in_kernel_fpu, true); + + if (!is_fpu_owner()) + enable_fpu(); + else + _save_fp(¤t->thread.fpu); + + write_fcsr(LOONGARCH_FCSR0, 0); +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + WARN_ON(!this_cpu_read(in_kernel_fpu)); + + if (!is_fpu_owner()) + disable_fpu(); + else + _restore_fp(¤t->thread.fpu); + + this_cpu_write(in_kernel_fpu, false); + + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end); From 8941e93ca5906fcca87dc74255b58054ec4d7868 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Mon, 1 May 2023 17:19:43 +0800 Subject: [PATCH 14/24] LoongArch: Optimize memory ops (memset/memcpy/memmove) To optimize memset()/memcpy()/memmove() and so on, we use a jump table to dispatch cases for short data lengths; and for long data lengths, we split the destination into head part (first 8 bytes), tail part (last 8 bytes) and middle part. The head part and tail part may be at unaligned addresses, while the middle part is always aligned (the middle part is allowed to overlap the head/tail part). In this way, the first and last 8 bytes may be unaligned accesses, but we can make sure the data in the middle is processed at an aligned destination address. We have tested micro-bench[1] on a Loongson-3C5000 16-core machine (2.2GHz): 1. 
memset

| length | src offset | dst offset | speed before | speed after | % |
|--------|------------|------------|--------------|-------------|---------|
| 8 | 0 | 0 | 696.191 | 1518.785 | 118.16% |
| 8 | 0 | 1 | 696.325 | 1518.937 | 118.14% |
| 50 | 0 | 0 | 969.976 | 8053.902 | 730.32% |
| 50 | 0 | 1 | 970.034 | 8058.475 | 730.74% |
| 300 | 0 | 0 | 5876.612 | 16544.703 | 181.53% |
| 300 | 0 | 1 | 5030.849 | 16549.011 | 228.95% |
| 1200 | 0 | 0 | 11797.077 | 16752.137 | 42.00% |
| 1200 | 0 | 1 | 5687.141 | 16645.233 | 192.68% |
| 4000 | 0 | 0 | 15723.27 | 16761.557 | 6.60% |
| 4000 | 0 | 1 | 5906.114 | 16732.316 | 183.30% |
| 8000 | 0 | 0 | 16751.403 | 16770.002 | 0.11% |
| 8000 | 0 | 1 | 5995.449 | 16754.07 | 179.45% |

2. memcpy

| length | src offset | dst offset | speed before | speed after | % |
|--------|------------|------------|--------------|-------------|---------|
| 8 | 0 | 0 | 696.2 | 1670.605 | 139.96% |
| 8 | 0 | 1 | 696.325 | 1671.138 | 139.99% |
| 50 | 0 | 0 | 969.974 | 8724.999 | 799.51% |
| 50 | 0 | 1 | 970.032 | 8730.138 | 799.98% |
| 300 | 0 | 0 | 5564.662 | 16272.652 | 192.43% |
| 300 | 0 | 1 | 4670.436 | 14972.842 | 220.59% |
| 1200 | 0 | 0 | 10740.23 | 16751.728 | 55.97% |
| 1200 | 0 | 1 | 5027.741 | 14874.564 | 195.85% |
| 4000 | 0 | 0 | 15122.367 | 16737.642 | 10.68% |
| 4000 | 0 | 1 | 5536.918 | 14890.397 | 168.93% |
| 8000 | 0 | 0 | 16505.453 | 16553.543 | 0.29% |
| 8000 | 0 | 1 | 5821.619 | 14841.804 | 154.94% |

3. memmove

| length | src offset | dst offset | speed before | speed after | % |
|--------|------------|------------|--------------|-------------|---------|
| 8 | 0 | 0 | 982.693 | 1670.568 | 70.00% |
| 8 | 0 | 1 | 983.023 | 1671.174 | 70.00% |
| 50 | 0 | 0 | 1230.87 | 8727.625 | 609.06% |
| 50 | 0 | 1 | 1232.515 | 8730.138 | 608.32% |
| 300 | 0 | 0 | 6490.375 | 16296.993 | 151.09% |
| 300 | 0 | 1 | 4282.687 | 14972.842 | 249.61% |
| 1200 | 0 | 0 | 11742.755 | 16752.546 | 42.66% |
| 1200 | 0 | 1 | 5039.338 | 14872.951 | 195.14% |
| 4000 | 0 | 0 | 15467.786 | 16737.09 | 8.21% |
| 4000 | 0 | 1 | 5009.905 | 14890.542 | 197.22% |
| 8000 | 0 | 0 | 16489.664 | 16553.273 | 0.39% |
| 8000 | 0 | 1 | 5823.786 | 14858.646 | 155.14% |

* speed: MB/s
* length: byte

[1] https://github.com/heiher/mem-bench

Signed-off-by: WANG Rui Signed-off-by: Huacai Chen --- arch/loongarch/lib/clear_user.S | 136 +++++++++++++++-- arch/loongarch/lib/copy_user.S | 251 +++++++++++++++++++++++++------- arch/loongarch/lib/memcpy.S | 147 +++++++++++++++---- arch/loongarch/lib/memmove.S | 120 ++++++++------- arch/loongarch/lib/memset.S | 118 +++++++++++---- 5 files changed, 604 insertions(+), 168 deletions(-) diff --git a/arch/loongarch/lib/clear_user.S b/arch/loongarch/lib/clear_user.S index 2dc48e61a2c8..fd1d62b244f2 100644 --- a/arch/loongarch/lib/clear_user.S +++ b/arch/loongarch/lib/clear_user.S @@ -13,7 +13,14 @@ .irp to, 0, 1, 2, 3, 4, 5, 6, 7 .L_fixup_handle_\to\(): - addi.d a0, a1, (\to) * (-8) + sub.d a0, a2, a0 + addi.d a0, a0, (\to) * (-8) + jr ra +.endr + +.irp to, 0, 2, 4 +.L_fixup_handle_s\to\(): + addi.d a0, a1, -\to jr ra .endr @@ -44,7 +51,7 @@ SYM_FUNC_START(__clear_user_generic) 2: move a0, a1 jr ra - _asm_extable 1b, .L_fixup_handle_0 + _asm_extable 1b, .L_fixup_handle_s0 SYM_FUNC_END(__clear_user_generic) /* @@ -54,12 +61,21 @@ * a1: size */ SYM_FUNC_START(__clear_user_fast) - beqz a1, 10f + sltui t0, a1, 9 + bnez t0, .Lsmall - ori a2, zero, 64 - blt a1, a2, 9f + add.d a2, a0, a1 +0: st.d zero, a0, 0 + + /* align up address */ + addi.d 
a0, a0, 8 + bstrins.d a0, zero, 2, 0 + + addi.d a3, a2, -64 + bgeu a0, a3, .Llt64 /* set 64 bytes at a time */ +.Lloop64: 1: st.d zero, a0, 0 2: st.d zero, a0, 8 3: st.d zero, a0, 16 @@ -68,24 +84,95 @@ SYM_FUNC_START(__clear_user_fast) 6: st.d zero, a0, 40 7: st.d zero, a0, 48 8: st.d zero, a0, 56 - addi.d a0, a0, 64 - addi.d a1, a1, -64 - bge a1, a2, 1b - - beqz a1, 10f + bltu a0, a3, .Lloop64 /* set the remaining bytes */ -9: st.b zero, a0, 0 - addi.d a0, a0, 1 - addi.d a1, a1, -1 - bgt a1, zero, 9b +.Llt64: + addi.d a3, a2, -32 + bgeu a0, a3, .Llt32 +9: st.d zero, a0, 0 +10: st.d zero, a0, 8 +11: st.d zero, a0, 16 +12: st.d zero, a0, 24 + addi.d a0, a0, 32 + +.Llt32: + addi.d a3, a2, -16 + bgeu a0, a3, .Llt16 +13: st.d zero, a0, 0 +14: st.d zero, a0, 8 + addi.d a0, a0, 16 + +.Llt16: + addi.d a3, a2, -8 + bgeu a0, a3, .Llt8 +15: st.d zero, a0, 0 + +.Llt8: +16: st.d zero, a2, -8 /* return */ -10: move a0, a1 + move a0, zero + jr ra + + .align 4 +.Lsmall: + pcaddi t0, 4 + slli.d a2, a1, 4 + add.d t0, t0, a2 + jr t0 + + .align 4 + move a0, zero + jr ra + + .align 4 +17: st.b zero, a0, 0 + move a0, zero + jr ra + + .align 4 +18: st.h zero, a0, 0 + move a0, zero + jr ra + + .align 4 +19: st.h zero, a0, 0 +20: st.b zero, a0, 2 + move a0, zero + jr ra + + .align 4 +21: st.w zero, a0, 0 + move a0, zero + jr ra + + .align 4 +22: st.w zero, a0, 0 +23: st.b zero, a0, 4 + move a0, zero + jr ra + + .align 4 +24: st.w zero, a0, 0 +25: st.h zero, a0, 4 + move a0, zero + jr ra + + .align 4 +26: st.w zero, a0, 0 +27: st.w zero, a0, 3 + move a0, zero + jr ra + + .align 4 +28: st.d zero, a0, 0 + move a0, zero jr ra /* fixup and ex_table */ + _asm_extable 0b, .L_fixup_handle_0 _asm_extable 1b, .L_fixup_handle_0 _asm_extable 2b, .L_fixup_handle_1 _asm_extable 3b, .L_fixup_handle_2 @@ -95,4 +182,23 @@ SYM_FUNC_START(__clear_user_fast) _asm_extable 7b, .L_fixup_handle_6 _asm_extable 8b, .L_fixup_handle_7 _asm_extable 9b, .L_fixup_handle_0 + _asm_extable 10b, .L_fixup_handle_1 + _asm_extable 11b, .L_fixup_handle_2 + _asm_extable 12b, .L_fixup_handle_3 + _asm_extable 13b, .L_fixup_handle_0 + _asm_extable 14b, .L_fixup_handle_1 + _asm_extable 15b, .L_fixup_handle_0 + _asm_extable 16b, .L_fixup_handle_1 + _asm_extable 17b, .L_fixup_handle_s0 + _asm_extable 18b, .L_fixup_handle_s0 + _asm_extable 19b, .L_fixup_handle_s0 + _asm_extable 20b, .L_fixup_handle_s2 + _asm_extable 21b, .L_fixup_handle_s0 + _asm_extable 22b, .L_fixup_handle_s0 + _asm_extable 23b, .L_fixup_handle_s4 + _asm_extable 24b, .L_fixup_handle_s0 + _asm_extable 25b, .L_fixup_handle_s4 + _asm_extable 26b, .L_fixup_handle_s0 + _asm_extable 27b, .L_fixup_handle_s4 + _asm_extable 28b, .L_fixup_handle_s0 SYM_FUNC_END(__clear_user_fast) diff --git a/arch/loongarch/lib/copy_user.S b/arch/loongarch/lib/copy_user.S index 55ac6020a1ad..b21f6d5d38f5 100644 --- a/arch/loongarch/lib/copy_user.S +++ b/arch/loongarch/lib/copy_user.S @@ -13,7 +13,14 @@ .irp to, 0, 1, 2, 3, 4, 5, 6, 7 .L_fixup_handle_\to\(): - addi.d a0, a2, (\to) * (-8) + sub.d a0, a2, a0 + addi.d a0, a0, (\to) * (-8) + jr ra +.endr + +.irp to, 0, 2, 4 +.L_fixup_handle_s\to\(): + addi.d a0, a2, -\to jr ra .endr @@ -47,8 +54,8 @@ SYM_FUNC_START(__copy_user_generic) 3: move a0, a2 jr ra - _asm_extable 1b, .L_fixup_handle_0 - _asm_extable 2b, .L_fixup_handle_0 + _asm_extable 1b, .L_fixup_handle_s0 + _asm_extable 2b, .L_fixup_handle_s0 SYM_FUNC_END(__copy_user_generic) /* @@ -59,65 +66,209 @@ SYM_FUNC_END(__copy_user_generic) * a2: n */ SYM_FUNC_START(__copy_user_fast) - beqz a2, 19f + sltui t0, a2, 9 + 
bnez t0, .Lsmall - ori a3, zero, 64 - blt a2, a3, 17f + add.d a3, a1, a2 + add.d a2, a0, a2 +0: ld.d t0, a1, 0 +1: st.d t0, a0, 0 + + /* align up destination address */ + andi t1, a0, 7 + sub.d t0, zero, t1 + addi.d t0, t0, 8 + add.d a1, a1, t0 + add.d a0, a0, t0 + + addi.d a4, a3, -64 + bgeu a1, a4, .Llt64 /* copy 64 bytes at a time */ -1: ld.d t0, a1, 0 -2: ld.d t1, a1, 8 -3: ld.d t2, a1, 16 -4: ld.d t3, a1, 24 -5: ld.d t4, a1, 32 -6: ld.d t5, a1, 40 -7: ld.d t6, a1, 48 -8: ld.d t7, a1, 56 -9: st.d t0, a0, 0 -10: st.d t1, a0, 8 -11: st.d t2, a0, 16 -12: st.d t3, a0, 24 -13: st.d t4, a0, 32 -14: st.d t5, a0, 40 -15: st.d t6, a0, 48 -16: st.d t7, a0, 56 - - addi.d a0, a0, 64 +.Lloop64: +2: ld.d t0, a1, 0 +3: ld.d t1, a1, 8 +4: ld.d t2, a1, 16 +5: ld.d t3, a1, 24 +6: ld.d t4, a1, 32 +7: ld.d t5, a1, 40 +8: ld.d t6, a1, 48 +9: ld.d t7, a1, 56 addi.d a1, a1, 64 - addi.d a2, a2, -64 - bge a2, a3, 1b - - beqz a2, 19f +10: st.d t0, a0, 0 +11: st.d t1, a0, 8 +12: st.d t2, a0, 16 +13: st.d t3, a0, 24 +14: st.d t4, a0, 32 +15: st.d t5, a0, 40 +16: st.d t6, a0, 48 +17: st.d t7, a0, 56 + addi.d a0, a0, 64 + bltu a1, a4, .Lloop64 /* copy the remaining bytes */ -17: ld.b t0, a1, 0 -18: st.b t0, a0, 0 - addi.d a0, a0, 1 - addi.d a1, a1, 1 - addi.d a2, a2, -1 - bgt a2, zero, 17b +.Llt64: + addi.d a4, a3, -32 + bgeu a1, a4, .Llt32 +18: ld.d t0, a1, 0 +19: ld.d t1, a1, 8 +20: ld.d t2, a1, 16 +21: ld.d t3, a1, 24 + addi.d a1, a1, 32 +22: st.d t0, a0, 0 +23: st.d t1, a0, 8 +24: st.d t2, a0, 16 +25: st.d t3, a0, 24 + addi.d a0, a0, 32 + +.Llt32: + addi.d a4, a3, -16 + bgeu a1, a4, .Llt16 +26: ld.d t0, a1, 0 +27: ld.d t1, a1, 8 + addi.d a1, a1, 16 +28: st.d t0, a0, 0 +29: st.d t1, a0, 8 + addi.d a0, a0, 16 + +.Llt16: + addi.d a4, a3, -8 + bgeu a1, a4, .Llt8 +30: ld.d t0, a1, 0 +31: st.d t0, a0, 0 + +.Llt8: +32: ld.d t0, a3, -8 +33: st.d t0, a2, -8 /* return */ -19: move a0, a2 + move a0, zero + jr ra + + .align 5 +.Lsmall: + pcaddi t0, 8 + slli.d a3, a2, 5 + add.d t0, t0, a3 + jr t0 + + .align 5 + move a0, zero + jr ra + + .align 5 +34: ld.b t0, a1, 0 +35: st.b t0, a0, 0 + move a0, zero + jr ra + + .align 5 +36: ld.h t0, a1, 0 +37: st.h t0, a0, 0 + move a0, zero + jr ra + + .align 5 +38: ld.h t0, a1, 0 +39: ld.b t1, a1, 2 +40: st.h t0, a0, 0 +41: st.b t1, a0, 2 + move a0, zero + jr ra + + .align 5 +42: ld.w t0, a1, 0 +43: st.w t0, a0, 0 + move a0, zero + jr ra + + .align 5 +44: ld.w t0, a1, 0 +45: ld.b t1, a1, 4 +46: st.w t0, a0, 0 +47: st.b t1, a0, 4 + move a0, zero + jr ra + + .align 5 +48: ld.w t0, a1, 0 +49: ld.h t1, a1, 4 +50: st.w t0, a0, 0 +51: st.h t1, a0, 4 + move a0, zero + jr ra + + .align 5 +52: ld.w t0, a1, 0 +53: ld.w t1, a1, 3 +54: st.w t0, a0, 0 +55: st.w t1, a0, 3 + move a0, zero + jr ra + + .align 5 +56: ld.d t0, a1, 0 +57: st.d t0, a0, 0 + move a0, zero jr ra /* fixup and ex_table */ + _asm_extable 0b, .L_fixup_handle_0 _asm_extable 1b, .L_fixup_handle_0 - _asm_extable 2b, .L_fixup_handle_1 - _asm_extable 3b, .L_fixup_handle_2 - _asm_extable 4b, .L_fixup_handle_3 - _asm_extable 5b, .L_fixup_handle_4 - _asm_extable 6b, .L_fixup_handle_5 - _asm_extable 7b, .L_fixup_handle_6 - _asm_extable 8b, .L_fixup_handle_7 + _asm_extable 2b, .L_fixup_handle_0 + _asm_extable 3b, .L_fixup_handle_0 + _asm_extable 4b, .L_fixup_handle_0 + _asm_extable 5b, .L_fixup_handle_0 + _asm_extable 6b, .L_fixup_handle_0 + _asm_extable 7b, .L_fixup_handle_0 + _asm_extable 8b, .L_fixup_handle_0 _asm_extable 9b, .L_fixup_handle_0 - _asm_extable 10b, .L_fixup_handle_1 - _asm_extable 11b, .L_fixup_handle_2 - _asm_extable 12b, 
.L_fixup_handle_3 - _asm_extable 13b, .L_fixup_handle_4 - _asm_extable 14b, .L_fixup_handle_5 - _asm_extable 15b, .L_fixup_handle_6 - _asm_extable 16b, .L_fixup_handle_7 - _asm_extable 17b, .L_fixup_handle_0 + _asm_extable 10b, .L_fixup_handle_0 + _asm_extable 11b, .L_fixup_handle_1 + _asm_extable 12b, .L_fixup_handle_2 + _asm_extable 13b, .L_fixup_handle_3 + _asm_extable 14b, .L_fixup_handle_4 + _asm_extable 15b, .L_fixup_handle_5 + _asm_extable 16b, .L_fixup_handle_6 + _asm_extable 17b, .L_fixup_handle_7 _asm_extable 18b, .L_fixup_handle_0 + _asm_extable 19b, .L_fixup_handle_0 + _asm_extable 20b, .L_fixup_handle_0 + _asm_extable 21b, .L_fixup_handle_0 + _asm_extable 22b, .L_fixup_handle_0 + _asm_extable 23b, .L_fixup_handle_1 + _asm_extable 24b, .L_fixup_handle_2 + _asm_extable 25b, .L_fixup_handle_3 + _asm_extable 26b, .L_fixup_handle_0 + _asm_extable 27b, .L_fixup_handle_0 + _asm_extable 28b, .L_fixup_handle_0 + _asm_extable 29b, .L_fixup_handle_1 + _asm_extable 30b, .L_fixup_handle_0 + _asm_extable 31b, .L_fixup_handle_0 + _asm_extable 32b, .L_fixup_handle_0 + _asm_extable 33b, .L_fixup_handle_1 + _asm_extable 34b, .L_fixup_handle_s0 + _asm_extable 35b, .L_fixup_handle_s0 + _asm_extable 36b, .L_fixup_handle_s0 + _asm_extable 37b, .L_fixup_handle_s0 + _asm_extable 38b, .L_fixup_handle_s0 + _asm_extable 39b, .L_fixup_handle_s0 + _asm_extable 40b, .L_fixup_handle_s0 + _asm_extable 41b, .L_fixup_handle_s2 + _asm_extable 42b, .L_fixup_handle_s0 + _asm_extable 43b, .L_fixup_handle_s0 + _asm_extable 44b, .L_fixup_handle_s0 + _asm_extable 45b, .L_fixup_handle_s0 + _asm_extable 46b, .L_fixup_handle_s0 + _asm_extable 47b, .L_fixup_handle_s4 + _asm_extable 48b, .L_fixup_handle_s0 + _asm_extable 49b, .L_fixup_handle_s0 + _asm_extable 50b, .L_fixup_handle_s0 + _asm_extable 51b, .L_fixup_handle_s4 + _asm_extable 52b, .L_fixup_handle_s0 + _asm_extable 53b, .L_fixup_handle_s0 + _asm_extable 54b, .L_fixup_handle_s0 + _asm_extable 55b, .L_fixup_handle_s4 + _asm_extable 56b, .L_fixup_handle_s0 + _asm_extable 57b, .L_fixup_handle_s0 SYM_FUNC_END(__copy_user_fast) diff --git a/arch/loongarch/lib/memcpy.S b/arch/loongarch/lib/memcpy.S index 3b7e1dec7109..39ce6621c704 100644 --- a/arch/loongarch/lib/memcpy.S +++ b/arch/loongarch/lib/memcpy.S @@ -44,6 +44,66 @@ SYM_FUNC_START(__memcpy_generic) SYM_FUNC_END(__memcpy_generic) _ASM_NOKPROBE(__memcpy_generic) + .align 5 +SYM_FUNC_START_NOALIGN(__memcpy_small) + pcaddi t0, 8 + slli.d a2, a2, 5 + add.d t0, t0, a2 + jr t0 + + .align 5 +0: jr ra + + .align 5 +1: ld.b t0, a1, 0 + st.b t0, a0, 0 + jr ra + + .align 5 +2: ld.h t0, a1, 0 + st.h t0, a0, 0 + jr ra + + .align 5 +3: ld.h t0, a1, 0 + ld.b t1, a1, 2 + st.h t0, a0, 0 + st.b t1, a0, 2 + jr ra + + .align 5 +4: ld.w t0, a1, 0 + st.w t0, a0, 0 + jr ra + + .align 5 +5: ld.w t0, a1, 0 + ld.b t1, a1, 4 + st.w t0, a0, 0 + st.b t1, a0, 4 + jr ra + + .align 5 +6: ld.w t0, a1, 0 + ld.h t1, a1, 4 + st.w t0, a0, 0 + st.h t1, a0, 4 + jr ra + + .align 5 +7: ld.w t0, a1, 0 + ld.w t1, a1, 3 + st.w t0, a0, 0 + st.w t1, a0, 3 + jr ra + + .align 5 +8: ld.d t0, a1, 0 + st.d t0, a0, 0 + jr ra +SYM_FUNC_END(__memcpy_small) +_ASM_NOKPROBE(__memcpy_small) + /* * void *__memcpy_fast(void *dst, const void *src, size_t n) * @@ -52,14 +112,27 @@ _ASM_NOKPROBE(__memcpy_generic) * a2: n */ SYM_FUNC_START(__memcpy_fast) - move a3, a0 - beqz a2, 3f + sltui t0, a2, 9 + bnez t0, __memcpy_small - ori a4, zero, 64 - blt a2, a4, 2f + add.d a3, a1, a2 + add.d a2, a0, a2 + ld.d a6, a1, 0 + ld.d a7, a3, -8 + + /* align up destination address */ + andi 
t1, a0, 7 + sub.d t0, zero, t1 + addi.d t0, t0, 8 + add.d a1, a1, t0 + add.d a5, a0, t0 + + addi.d a4, a3, -64 + bgeu a1, a4, .Llt64 /* copy 64 bytes at a time */ -1: ld.d t0, a1, 0 +.Lloop64: + ld.d t0, a1, 0 ld.d t1, a1, 8 ld.d t2, a1, 16 ld.d t3, a1, 24 @@ -67,32 +140,54 @@ SYM_FUNC_START(__memcpy_fast) ld.d t5, a1, 40 ld.d t6, a1, 48 ld.d t7, a1, 56 - st.d t0, a0, 0 - st.d t1, a0, 8 - st.d t2, a0, 16 - st.d t3, a0, 24 - st.d t4, a0, 32 - st.d t5, a0, 40 - st.d t6, a0, 48 - st.d t7, a0, 56 - - addi.d a0, a0, 64 addi.d a1, a1, 64 - addi.d a2, a2, -64 - bge a2, a4, 1b - - beqz a2, 3f + st.d t0, a5, 0 + st.d t1, a5, 8 + st.d t2, a5, 16 + st.d t3, a5, 24 + st.d t4, a5, 32 + st.d t5, a5, 40 + st.d t6, a5, 48 + st.d t7, a5, 56 + addi.d a5, a5, 64 + bltu a1, a4, .Lloop64 /* copy the remaining bytes */ -2: ld.b t0, a1, 0 - st.b t0, a0, 0 - addi.d a0, a0, 1 - addi.d a1, a1, 1 - addi.d a2, a2, -1 - bgt a2, zero, 2b +.Llt64: + addi.d a4, a3, -32 + bgeu a1, a4, .Llt32 + ld.d t0, a1, 0 + ld.d t1, a1, 8 + ld.d t2, a1, 16 + ld.d t3, a1, 24 + addi.d a1, a1, 32 + st.d t0, a5, 0 + st.d t1, a5, 8 + st.d t2, a5, 16 + st.d t3, a5, 24 + addi.d a5, a5, 32 + +.Llt32: + addi.d a4, a3, -16 + bgeu a1, a4, .Llt16 + ld.d t0, a1, 0 + ld.d t1, a1, 8 + addi.d a1, a1, 16 + st.d t0, a5, 0 + st.d t1, a5, 8 + addi.d a5, a5, 16 + +.Llt16: + addi.d a4, a3, -8 + bgeu a1, a4, .Llt8 + ld.d t0, a1, 0 + st.d t0, a5, 0 + +.Llt8: + st.d a6, a0, 0 + st.d a7, a2, -8 /* return */ -3: move a0, a3 jr ra SYM_FUNC_END(__memcpy_fast) _ASM_NOKPROBE(__memcpy_fast) diff --git a/arch/loongarch/lib/memmove.S b/arch/loongarch/lib/memmove.S index b796c3d6da05..45b725ba7867 100644 --- a/arch/loongarch/lib/memmove.S +++ b/arch/loongarch/lib/memmove.S @@ -11,23 +11,9 @@ #include SYM_FUNC_START(memmove) - blt a0, a1, 1f /* dst < src, memcpy */ - blt a1, a0, 3f /* src < dst, rmemcpy */ + blt a0, a1, memcpy /* dst < src, memcpy */ + blt a1, a0, rmemcpy /* src < dst, rmemcpy */ jr ra /* dst == src, return */ - - /* if (src - dst) < 64, copy 1 byte at a time */ -1: ori a3, zero, 64 - sub.d t0, a1, a0 - blt t0, a3, 2f - b memcpy -2: b __memcpy_generic - - /* if (dst - src) < 64, copy 1 byte at a time */ -3: ori a3, zero, 64 - sub.d t0, a0, a1 - blt t0, a3, 4f - b rmemcpy -4: b __rmemcpy_generic SYM_FUNC_END(memmove) _ASM_NOKPROBE(memmove) @@ -76,50 +62,80 @@ _ASM_NOKPROBE(__rmemcpy_generic) * a2: n */ SYM_FUNC_START(__rmemcpy_fast) - move a3, a0 - beqz a2, 3f + sltui t0, a2, 9 + bnez t0, __memcpy_small - add.d a0, a0, a2 - add.d a1, a1, a2 + add.d a3, a1, a2 + add.d a2, a0, a2 + ld.d a6, a1, 0 + ld.d a7, a3, -8 - ori a4, zero, 64 - blt a2, a4, 2f + /* align up destination address */ + andi t1, a2, 7 + sub.d a3, a3, t1 + sub.d a5, a2, t1 + + addi.d a4, a1, 64 + bgeu a4, a3, .Llt64 /* copy 64 bytes at a time */ -1: ld.d t0, a1, -8 - ld.d t1, a1, -16 - ld.d t2, a1, -24 - ld.d t3, a1, -32 - ld.d t4, a1, -40 - ld.d t5, a1, -48 - ld.d t6, a1, -56 - ld.d t7, a1, -64 - st.d t0, a0, -8 - st.d t1, a0, -16 - st.d t2, a0, -24 - st.d t3, a0, -32 - st.d t4, a0, -40 - st.d t5, a0, -48 - st.d t6, a0, -56 - st.d t7, a0, -64 - - addi.d a0, a0, -64 - addi.d a1, a1, -64 - addi.d a2, a2, -64 - bge a2, a4, 1b - - beqz a2, 3f +.Lloop64: + ld.d t0, a3, -8 + ld.d t1, a3, -16 + ld.d t2, a3, -24 + ld.d t3, a3, -32 + ld.d t4, a3, -40 + ld.d t5, a3, -48 + ld.d t6, a3, -56 + ld.d t7, a3, -64 + addi.d a3, a3, -64 + st.d t0, a5, -8 + st.d t1, a5, -16 + st.d t2, a5, -24 + st.d t3, a5, -32 + st.d t4, a5, -40 + st.d t5, a5, -48 + st.d t6, a5, -56 + st.d t7, a5, -64 + addi.d a5, a5, -64 + 
bltu a4, a3, .Lloop64 /* copy the remaining bytes */ -2: ld.b t0, a1, -1 - st.b t0, a0, -1 - addi.d a0, a0, -1 - addi.d a1, a1, -1 - addi.d a2, a2, -1 - bgt a2, zero, 2b +.Llt64: + addi.d a4, a1, 32 + bgeu a4, a3, .Llt32 + ld.d t0, a3, -8 + ld.d t1, a3, -16 + ld.d t2, a3, -24 + ld.d t3, a3, -32 + addi.d a3, a3, -32 + st.d t0, a5, -8 + st.d t1, a5, -16 + st.d t2, a5, -24 + st.d t3, a5, -32 + addi.d a5, a5, -32 + +.Llt32: + addi.d a4, a1, 16 + bgeu a4, a3, .Llt16 + ld.d t0, a3, -8 + ld.d t1, a3, -16 + addi.d a3, a3, -16 + st.d t0, a5, -8 + st.d t1, a5, -16 + addi.d a5, a5, -16 + +.Llt16: + addi.d a4, a1, 8 + bgeu a4, a3, .Llt8 + ld.d t0, a3, -8 + st.d t0, a5, -8 + +.Llt8: + st.d a6, a0, 0 + st.d a7, a2, -8 /* return */ -3: move a0, a3 jr ra SYM_FUNC_END(__rmemcpy_fast) _ASM_NOKPROBE(__rmemcpy_fast) diff --git a/arch/loongarch/lib/memset.S b/arch/loongarch/lib/memset.S index a9eb732ab2ad..b39c6194e3ae 100644 --- a/arch/loongarch/lib/memset.S +++ b/arch/loongarch/lib/memset.S @@ -56,39 +56,107 @@ _ASM_NOKPROBE(__memset_generic) * a2: n */ SYM_FUNC_START(__memset_fast) - move a3, a0 - beqz a2, 3f - - ori a4, zero, 64 - blt a2, a4, 2f - /* fill a1 to 64 bits */ fill_to_64 a1 + sltui t0, a2, 9 + bnez t0, .Lsmall + + add.d a2, a0, a2 + st.d a1, a0, 0 + + /* align up address */ + addi.d a3, a0, 8 + bstrins.d a3, zero, 2, 0 + + addi.d a4, a2, -64 + bgeu a3, a4, .Llt64 + /* set 64 bytes at a time */ -1: st.d a1, a0, 0 - st.d a1, a0, 8 - st.d a1, a0, 16 - st.d a1, a0, 24 - st.d a1, a0, 32 - st.d a1, a0, 40 - st.d a1, a0, 48 - st.d a1, a0, 56 - - addi.d a0, a0, 64 - addi.d a2, a2, -64 - bge a2, a4, 1b - - beqz a2, 3f +.Lloop64: + st.d a1, a3, 0 + st.d a1, a3, 8 + st.d a1, a3, 16 + st.d a1, a3, 24 + st.d a1, a3, 32 + st.d a1, a3, 40 + st.d a1, a3, 48 + st.d a1, a3, 56 + addi.d a3, a3, 64 + bltu a3, a4, .Lloop64 /* set the remaining bytes */ -2: st.b a1, a0, 0 - addi.d a0, a0, 1 - addi.d a2, a2, -1 - bgt a2, zero, 2b +.Llt64: + addi.d a4, a2, -32 + bgeu a3, a4, .Llt32 + st.d a1, a3, 0 + st.d a1, a3, 8 + st.d a1, a3, 16 + st.d a1, a3, 24 + addi.d a3, a3, 32 + +.Llt32: + addi.d a4, a2, -16 + bgeu a3, a4, .Llt16 + st.d a1, a3, 0 + st.d a1, a3, 8 + addi.d a3, a3, 16 + +.Llt16: + addi.d a4, a2, -8 + bgeu a3, a4, .Llt8 + st.d a1, a3, 0 + +.Llt8: + st.d a1, a2, -8 /* return */ -3: move a0, a3 + jr ra + + .align 4 +.Lsmall: + pcaddi t0, 4 + slli.d a2, a2, 4 + add.d t0, t0, a2 + jr t0 + + .align 4 +0: jr ra + + .align 4 +1: st.b a1, a0, 0 + jr ra + + .align 4 +2: st.h a1, a0, 0 + jr ra + + .align 4 +3: st.h a1, a0, 0 + st.b a1, a0, 2 + jr ra + + .align 4 +4: st.w a1, a0, 0 + jr ra + + .align 4 +5: st.w a1, a0, 0 + st.b a1, a0, 4 + jr ra + + .align 4 +6: st.w a1, a0, 0 + st.h a1, a0, 4 + jr ra + + .align 4 +7: st.w a1, a0, 0 + st.w a1, a0, 3 + jr ra + + .align 4 +8: st.d a1, a0, 0 jr ra SYM_FUNC_END(__memset_fast) _ASM_NOKPROBE(__memset_fast) From 69e3a6aa6be21de6aaf38130fad97ecde34a193c Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 1 May 2023 17:19:43 +0800 Subject: [PATCH 15/24] LoongArch: Add checksum optimization for 64-bit system LoongArch platform is 64-bit system, which supports 8-bytes memory accessing, but generic checksum functions use 4-byte memory access. So add 8-bytes memory access optimization for checksum functions on LoongArch. And the code comes from arm64 system. When network hw checksum is disabled, iperf performance improves about 10% with this patch. 
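The key building block throughout is end-around-carry ("one's complement") folding; a worked sketch of the 32-to-16-bit fold implemented by the new csum_fold() below:

    u32 tmp = 0x12345678;
    u32 t = tmp + rol32(tmp, 16);  /* 0x12345678 + 0x56781234 = 0x68ac68ac;
                                      a carry between the 16-bit halves would
                                      wrap back into the upper half */
    u16 sum = ~t >> 16;            /* 0x97539753 >> 16 = 0x9753, the folded checksum */

The 64-bit accumulate() helper in csum.c applies the same idea one level up: "sum += data; if (sum < data) sum += 1;" re-adds the carry-out so the running sum remains a valid one's complement accumulator.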
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/checksum.h | 66 ++++++++++++ arch/loongarch/lib/Makefile | 2 +- arch/loongarch/lib/csum.c | 141 ++++++++++++++++++++++++++ 3 files changed, 208 insertions(+), 1 deletion(-) create mode 100644 arch/loongarch/include/asm/checksum.h create mode 100644 arch/loongarch/lib/csum.c diff --git a/arch/loongarch/include/asm/checksum.h b/arch/loongarch/include/asm/checksum.h new file mode 100644 index 000000000000..cabbf6af44c4 --- /dev/null +++ b/arch/loongarch/include/asm/checksum.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2016 ARM Ltd. + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ +#ifndef __ASM_CHECKSUM_H +#define __ASM_CHECKSUM_H + +#include +#include + +#define _HAVE_ARCH_IPV6_CSUM +__sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __wsum sum); + +/* + * turns a 32-bit partial checksum (e.g. from csum_partial) into a + * 1's complement 16-bit checksum. + */ +static inline __sum16 csum_fold(__wsum sum) +{ + u32 tmp = (__force u32)sum; + + /* + * swap the two 16-bit halves of sum + * if there is a carry from adding the two 16-bit halves, + * it will carry from the lower half into the upper half, + * giving us the correct sum in the upper half. + */ + return (__force __sum16)(~(tmp + rol32(tmp, 16)) >> 16); +} +#define csum_fold csum_fold + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. ihl is the number + * of 32-bit words and is always >= 5. + */ +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + u64 sum; + __uint128_t tmp; + int n = ihl; /* we want it signed */ + + tmp = *(const __uint128_t *)iph; + iph += 16; + n -= 4; + tmp += ((tmp >> 64) | (tmp << 64)); + sum = tmp >> 64; + do { + sum += *(const u32 *)iph; + iph += 4; + } while (--n > 0); + + sum += ror64(sum, 32); + return csum_fold((__force __wsum)(sum >> 32)); +} +#define ip_fast_csum ip_fast_csum + +extern unsigned int do_csum(const unsigned char *buff, int len); +#define do_csum do_csum + +#include + +#endif /* __ASM_CHECKSUM_H */ diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile index 40bde632900f..271ed45f51bb 100644 --- a/arch/loongarch/lib/Makefile +++ b/arch/loongarch/lib/Makefile @@ -4,4 +4,4 @@ # lib-y += delay.o memset.o memcpy.o memmove.o \ - clear_user.o copy_user.o dump_tlb.o unaligned.o + clear_user.o copy_user.o csum.o dump_tlb.o unaligned.o diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c new file mode 100644 index 000000000000..a5e84b403c3b --- /dev/null +++ b/arch/loongarch/lib/csum.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2019-2020 Arm Ltd. + +#include +#include +#include + +#include + +static u64 accumulate(u64 sum, u64 data) +{ + sum += data; + if (sum < data) + sum += 1; + return sum; +} + +/* + * We over-read the buffer and this makes KASAN unhappy. Instead, disable + * instrumentation and call kasan explicitly. 
+ */ +unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len) +{ + unsigned int offset, shift, sum; + const u64 *ptr; + u64 data, sum64 = 0; + + if (unlikely(len == 0)) + return 0; + + offset = (unsigned long)buff & 7; + /* + * This is to all intents and purposes safe, since rounding down cannot + * result in a different page or cache line being accessed, and @buff + * should absolutely not be pointing to anything read-sensitive. We do, + * however, have to be careful not to piss off KASAN, which means using + * unchecked reads to accommodate the head and tail, for which we'll + * compensate with an explicit check up-front. + */ + kasan_check_read(buff, len); + ptr = (u64 *)(buff - offset); + len = len + offset - 8; + + /* + * Head: zero out any excess leading bytes. Shifting back by the same + * amount should be at least as fast as any other way of handling the + * odd/even alignment, and means we can ignore it until the very end. + */ + shift = offset * 8; + data = *ptr++; + data = (data >> shift) << shift; + + /* + * Body: straightforward aligned loads from here on (the paired loads + * underlying the quadword type still only need dword alignment). The + * main loop strictly excludes the tail, so the second loop will always + * run at least once. + */ + while (unlikely(len > 64)) { + __uint128_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = *(__uint128_t *)ptr; + tmp2 = *(__uint128_t *)(ptr + 2); + tmp3 = *(__uint128_t *)(ptr + 4); + tmp4 = *(__uint128_t *)(ptr + 6); + + len -= 64; + ptr += 8; + + /* This is the "don't dump the carry flag into a GPR" idiom */ + tmp1 += (tmp1 >> 64) | (tmp1 << 64); + tmp2 += (tmp2 >> 64) | (tmp2 << 64); + tmp3 += (tmp3 >> 64) | (tmp3 << 64); + tmp4 += (tmp4 >> 64) | (tmp4 << 64); + tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64); + tmp1 += (tmp1 >> 64) | (tmp1 << 64); + tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64); + tmp3 += (tmp3 >> 64) | (tmp3 << 64); + tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64); + tmp1 += (tmp1 >> 64) | (tmp1 << 64); + tmp1 = ((tmp1 >> 64) << 64) | sum64; + tmp1 += (tmp1 >> 64) | (tmp1 << 64); + sum64 = tmp1 >> 64; + } + while (len > 8) { + __uint128_t tmp; + + sum64 = accumulate(sum64, data); + tmp = *(__uint128_t *)ptr; + + len -= 16; + ptr += 2; + + data = tmp >> 64; + sum64 = accumulate(sum64, tmp); + } + if (len > 0) { + sum64 = accumulate(sum64, data); + data = *ptr; + len -= 8; + } + /* + * Tail: zero any over-read bytes similarly to the head, again + * preserving odd/even alignment. 
+ */
+	shift = len * -8;
+	data = (data << shift) >> shift;
+	sum64 = accumulate(sum64, data);
+
+	/* Finally, folding */
+	sum64 += (sum64 >> 32) | (sum64 << 32);
+	sum = sum64 >> 32;
+	sum += (sum >> 16) | (sum << 16);
+	if (offset & 1)
+		return (u16)swab32(sum);
+
+	return sum >> 16;
+}
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum csum)
+{
+	__uint128_t src, dst;
+	u64 sum = (__force u64)csum;
+
+	src = *(const __uint128_t *)saddr->s6_addr;
+	dst = *(const __uint128_t *)daddr->s6_addr;
+
+	sum += (__force u32)htonl(len);
+	sum += (u32)proto << 24;
+	src += (src >> 64) | (src << 64);
+	dst += (dst >> 64) | (dst << 64);
+
+	sum = accumulate(sum, src >> 64);
+	sum = accumulate(sum, dst >> 64);
+
+	sum += ((sum >> 32) | (sum << 32));
+	return csum_fold((__force __wsum)(sum >> 32));
+}
+EXPORT_SYMBOL(csum_ipv6_magic);

From 2f1648220214d18168e55920c21014e71c2d5bbc Mon Sep 17 00:00:00 2001
From: Min Zhou
Date: Mon, 1 May 2023 17:19:43 +0800
Subject: [PATCH 16/24] LoongArch: crypto: Add crc32 and crc32c hw acceleration

With a blatant copy of some MIPS bits we introduce the crc32 and crc32c
hw accelerated module to LoongArch.

LoongArch provides these instructions to calculate crc32 and crc32c:
* crc.w.b.w    crcc.w.b.w
* crc.w.h.w    crcc.w.h.w
* crc.w.w.w    crcc.w.w.w
* crc.w.d.w    crcc.w.d.w

So we can make use of these instructions to improve the performance of
calculation for crc32(c) checksums.

As can be seen from the following test results, crc32(c) instructions
can improve the performance by about 58%.

  Buffer size   Software implementation   Hardware acceleration   Accel.
                time cost (seconds)       time cost (seconds)
  100 KB        0.000845                  0.000534                59.1%
  1 MB          0.007758                  0.004836                59.4%
  10 MB         0.076593                  0.047682                59.4%
  100 MB        0.756734                  0.479126                58.5%
  1000 MB       7.563841                  4.778266                58.5%

Signed-off-by: Min Zhou
Signed-off-by: Huacai Chen
---
 arch/loongarch/Makefile                 |   2 +
 arch/loongarch/crypto/Kconfig           |  14 ++
 arch/loongarch/crypto/Makefile          |   6 +
 arch/loongarch/crypto/crc32-loongarch.c | 304 ++++++++++++++++++++++++
 crypto/Kconfig                          |   3 +
 5 files changed, 329 insertions(+)
 create mode 100644 arch/loongarch/crypto/Kconfig
 create mode 100644 arch/loongarch/crypto/Makefile
 create mode 100644 arch/loongarch/crypto/crc32-loongarch.c

diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index f71edf574101..a27e264bdaa5 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -115,6 +115,8 @@ endif
 libs-y += arch/loongarch/lib/
 libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
 
+drivers-y += arch/loongarch/crypto/
+
 # suspend and hibernation support
 drivers-$(CONFIG_PM) += arch/loongarch/power/
 
diff --git a/arch/loongarch/crypto/Kconfig b/arch/loongarch/crypto/Kconfig
new file mode 100644
index 000000000000..200a6e8b43b1
--- /dev/null
+++ b/arch/loongarch/crypto/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "Accelerated Cryptographic Algorithms for CPU (loongarch)"
+
+config CRYPTO_CRC32_LOONGARCH
+	tristate "CRC32c and CRC32"
+	select CRC32
+	select CRYPTO_HASH
+	help
+	  CRC32c and CRC32 CRC algorithms
+
+	  Architecture: LoongArch with CRC32 instructions
+
+endmenu

diff --git a/arch/loongarch/crypto/Makefile b/arch/loongarch/crypto/Makefile
new file mode 100644
index 000000000000..d22613d27ce9
--- /dev/null
+++ b/arch/loongarch/crypto/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for LoongArch crypto files.
+# + +obj-$(CONFIG_CRYPTO_CRC32_LOONGARCH) += crc32-loongarch.o diff --git a/arch/loongarch/crypto/crc32-loongarch.c b/arch/loongarch/crypto/crc32-loongarch.c new file mode 100644 index 000000000000..1f2a2c3839bc --- /dev/null +++ b/arch/loongarch/crypto/crc32-loongarch.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * crc32.c - CRC32 and CRC32C using LoongArch crc* instructions + * + * Module based on mips/crypto/crc32-mips.c + * + * Copyright (C) 2014 Linaro Ltd + * Copyright (C) 2018 MIPS Tech, LLC + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include +#include + +#include +#include + +#define _CRC32(crc, value, size, type) \ +do { \ + __asm__ __volatile__( \ + #type ".w." #size ".w" " %0, %1, %0\n\t"\ + : "+r" (crc) \ + : "r" (value) \ + : "memory"); \ +} while (0) + +#define CRC32(crc, value, size) _CRC32(crc, value, size, crc) +#define CRC32C(crc, value, size) _CRC32(crc, value, size, crcc) + +static u32 crc32_loongarch_hw(u32 crc_, const u8 *p, unsigned int len) +{ + u32 crc = crc_; + + while (len >= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32(crc, value, d); + p += sizeof(u64); + len -= sizeof(u64); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32(crc, value, w); + p += sizeof(u32); + len -= sizeof(u32); + } + + if (len & sizeof(u16)) { + u16 value = get_unaligned_le16(p); + + CRC32(crc, value, h); + p += sizeof(u16); + } + + if (len & sizeof(u8)) { + u8 value = *p++; + + CRC32(crc, value, b); + } + + return crc; +} + +static u32 crc32c_loongarch_hw(u32 crc_, const u8 *p, unsigned int len) +{ + u32 crc = crc_; + + while (len >= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32C(crc, value, d); + p += sizeof(u64); + len -= sizeof(u64); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32C(crc, value, w); + p += sizeof(u32); + len -= sizeof(u32); + } + + if (len & sizeof(u16)) { + u16 value = get_unaligned_le16(p); + + CRC32C(crc, value, h); + p += sizeof(u16); + } + + if (len & sizeof(u8)) { + u8 value = *p++; + + CRC32C(crc, value, b); + } + + return crc; +} + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +struct chksum_ctx { + u32 key; +}; + +struct chksum_desc_ctx { + u32 crc; +}; + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = mctx->key; + + return 0; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set the seed. 
+ */ +static int chksum_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) +{ + struct chksum_ctx *mctx = crypto_shash_ctx(tfm); + + if (keylen != sizeof(mctx->key)) + return -EINVAL; + + mctx->key = get_unaligned_le32(key); + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = crc32_loongarch_hw(ctx->crc, data, length); + return 0; +} + +static int chksumc_update(struct shash_desc *desc, const u8 *data, unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = crc32c_loongarch_hw(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + put_unaligned_le32(ctx->crc, out); + return 0; +} + +static int chksumc_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + put_unaligned_le32(~ctx->crc, out); + return 0; +} + +static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out) +{ + put_unaligned_le32(crc32_loongarch_hw(crc, data, len), out); + return 0; +} + +static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out) +{ + put_unaligned_le32(~crc32c_loongarch_hw(crc, data, len), out); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(ctx->crc, data, len, out); +} + +static int chksumc_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksumc_finup(ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out) +{ + struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); + + return __chksum_finup(mctx->key, data, length, out); +} + +static int chksumc_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out) +{ + struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); + + return __chksumc_finup(mctx->key, data, length, out); +} + +static int chksum_cra_init(struct crypto_tfm *tfm) +{ + struct chksum_ctx *mctx = crypto_tfm_ctx(tfm); + + mctx->key = 0; + return 0; +} + +static int chksumc_cra_init(struct crypto_tfm *tfm) +{ + struct chksum_ctx *mctx = crypto_tfm_ctx(tfm); + + mctx->key = ~0; + return 0; +} + +static struct shash_alg crc32_alg = { + .digestsize = CHKSUM_DIGEST_SIZE, + .setkey = chksum_setkey, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-loongarch", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_alignmask = 0, + .cra_ctxsize = sizeof(struct chksum_ctx), + .cra_module = THIS_MODULE, + .cra_init = chksum_cra_init, + } +}; + +static struct shash_alg crc32c_alg = { + .digestsize = CHKSUM_DIGEST_SIZE, + .setkey = chksum_setkey, + .init = chksum_init, + .update = chksumc_update, + .final = chksumc_final, + .finup = chksumc_finup, + .digest = chksumc_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-loongarch", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .cra_blocksize = 
CHKSUM_BLOCK_SIZE, + .cra_alignmask = 0, + .cra_ctxsize = sizeof(struct chksum_ctx), + .cra_module = THIS_MODULE, + .cra_init = chksumc_cra_init, + } +}; + +static int __init crc32_mod_init(void) +{ + int err; + + if (!cpu_has(CPU_FEATURE_CRC32)) + return 0; + + err = crypto_register_shash(&crc32_alg); + if (err) + return err; + + err = crypto_register_shash(&crc32c_alg); + if (err) + return err; + + return 0; +} + +static void __exit crc32_mod_exit(void) +{ + if (!cpu_has(CPU_FEATURE_CRC32)) + return; + + crypto_unregister_shash(&crc32_alg); + crypto_unregister_shash(&crc32c_alg); +} + +module_init(crc32_mod_init); +module_exit(crc32_mod_exit); + +MODULE_AUTHOR("Min Zhou "); +MODULE_AUTHOR("Huacai Chen "); +MODULE_DESCRIPTION("CRC32 and CRC32C using LoongArch crc* instructions"); +MODULE_LICENSE("GPL v2"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 9c86f7045157..a0e080d5f6ae 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1395,6 +1395,9 @@ endif if ARM64 source "arch/arm64/crypto/Kconfig" endif +if LOONGARCH +source "arch/loongarch/crypto/Kconfig" +endif if MIPS source "arch/mips/crypto/Kconfig" endif From d4c937c2a57bbba24790be6fe7a791456f5fbb60 Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Mon, 1 May 2023 17:19:52 +0800 Subject: [PATCH 17/24] LoongArch: Add ARCH_HAS_FORTIFY_SOURCE selection FORTIFY_SOURCE could detect various overflows at compile and run time. ARCH_HAS_FORTIFY_SOURCE means that the architecture can be built and run with CONFIG_FORTIFY_SOURCE. So select it in LoongArch. See more about this feature from commit 6974f0c4555e285 ("include/linux/ string.h: add the option of fortified string.h functions"). Signed-off-by: Qing Zhang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 3ddde336e6a5..3e5d6acbf240 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -10,6 +10,7 @@ config LOONGARCH select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST From 8b5ee2c66d5c4c1312fd193d4138e6963160ba43 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 1 May 2023 17:19:52 +0800 Subject: [PATCH 18/24] LoongArch: Add support for function error injection Inspired by the commit 42d038c4fb00f ("arm64: Add support for function error injection") and the commit ee55ff803b383 ("riscv: Add support for function error injection"), this patch supports function error injection for LoongArch. Mainly implement two functions: (1) regs_set_return_value() which is used to overwrite the return value, (2) override_function_with_return() which is used to override the probed function returning and jump to its caller. Here is a simple test under CONFIG_FUNCTION_ERROR_INJECTION and CONFIG_FAIL_FUNCTION: # echo sys_clone > /sys/kernel/debug/fail_function/inject # echo 100 > /sys/kernel/debug/fail_function/probability # dmesg bash: fork: Invalid argument # dmesg ... FAULT_INJECTION: forcing a failure. name fail_function, interval 1, probability 100, space 0, times 1 ... 
Call Trace: [<90000000002238f4>] show_stack+0x5c/0x180 [<90000000012e384c>] dump_stack_lvl+0x60/0x88 [<9000000000b1879c>] should_fail_ex+0x1b0/0x1f4 [<900000000032ead4>] fei_kprobe_handler+0x28/0x6c [<9000000000230970>] kprobe_breakpoint_handler+0xf0/0x118 [<90000000012e3e60>] do_bp+0x2c4/0x358 [<9000000002241924>] exception_handlers+0x1924/0x10000 [<900000000023b7d0>] sys_clone+0x0/0x4 [<90000000012e4744>] do_syscall+0x7c/0x94 [<9000000000221e44>] handle_syscall+0xc4/0x160 Tested-by: Hengqi Chen Acked-by: Masami Hiramatsu (Google) Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/ptrace.h | 5 +++++ arch/loongarch/lib/Makefile | 2 ++ arch/loongarch/lib/error-inject.c | 10 ++++++++++ 4 files changed, 18 insertions(+) create mode 100644 arch/loongarch/lib/error-inject.c diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 3e5d6acbf240..629a6dac33e1 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -100,6 +100,7 @@ config LOONGARCH select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ARG_ACCESS_API + select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GENERIC_VDSO diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h index d761db943335..35f0958163ac 100644 --- a/arch/loongarch/include/asm/ptrace.h +++ b/arch/loongarch/include/asm/ptrace.h @@ -154,6 +154,11 @@ static inline long regs_return_value(struct pt_regs *regs) return regs->regs[4]; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long val) +{ + regs->regs[4] = val; +} + #define instruction_pointer(regs) ((regs)->csr_era) #define profile_pc(regs) instruction_pointer(regs) diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile index 271ed45f51bb..d60d4e096cfa 100644 --- a/arch/loongarch/lib/Makefile +++ b/arch/loongarch/lib/Makefile @@ -5,3 +5,5 @@ lib-y += delay.o memset.o memcpy.o memmove.o \ clear_user.o copy_user.o csum.o dump_tlb.o unaligned.o + +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/loongarch/lib/error-inject.c b/arch/loongarch/lib/error-inject.c new file mode 100644 index 000000000000..afc9e1c7c973 --- /dev/null +++ b/arch/loongarch/lib/error-inject.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +void override_function_with_return(struct pt_regs *regs) +{ + instruction_pointer_set(regs, regs->regs[1]); +} +NOKPROBE_SYMBOL(override_function_with_return); From 6fbff14a638293dcd0550cac2921f308ef6abe0e Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Mon, 1 May 2023 17:19:52 +0800 Subject: [PATCH 19/24] LoongArch: ftrace: Abstract DYNAMIC_FTRACE_WITH_ARGS accesses Add new ftrace_regs_{get,set}_*() helpers which can be used to manipulate ftrace_regs. When CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS=y, these can always be used on any ftrace_regs, and when CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS =n these can be used when regs are available. 
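As a rough usage sketch (not part of this patch; my_callback and my_ops are
made-up names), a ftrace_ops handler written against these helpers no longer
needs to touch pt_regs directly:

  #include <linux/ftrace.h>
  #include <linux/printk.h>

  static void my_callback(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *op, struct ftrace_regs *fregs)
  {
          /* architecture-neutral accessors added below */
          unsigned long pc = ftrace_regs_get_instruction_pointer(fregs);
          unsigned long arg0 = ftrace_regs_get_argument(fregs, 0);

          pr_debug("hit %ps, first argument 0x%lx\n", (void *)pc, arg0);
  }

  static struct ftrace_ops my_ops = {
          .func = my_callback,
  };

The callback would then be attached with register_ftrace_function(&my_ops)
as usual.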
Signed-off-by: Qing Zhang
Signed-off-by: Huacai Chen
---
 arch/loongarch/include/asm/ftrace.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h
index 3418d32d4fc7..aa5c57b5f333 100644
--- a/arch/loongarch/include/asm/ftrace.h
+++ b/arch/loongarch/include/asm/ftrace.h
@@ -54,6 +54,31 @@ static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
 	return &fregs->regs;
 }
 
+static __always_inline unsigned long
+ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs)
+{
+	return instruction_pointer(&fregs->regs);
+}
+
+static __always_inline void
+ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip)
+{
+	instruction_pointer_set(&fregs->regs, ip);
+}
+
+#define ftrace_regs_get_argument(fregs, n) \
+	regs_get_kernel_argument(&(fregs)->regs, n)
+#define ftrace_regs_get_stack_pointer(fregs) \
+	kernel_stack_pointer(&(fregs)->regs)
+#define ftrace_regs_return_value(fregs) \
+	regs_return_value(&(fregs)->regs)
+#define ftrace_regs_set_return_value(fregs, ret) \
+	regs_set_return_value(&(fregs)->regs, ret)
+#define ftrace_override_function_with_return(fregs) \
+	override_function_with_return(&(fregs)->regs)
+#define ftrace_regs_query_register_offset(name) \
+	regs_query_register_offset(name)
+
 #define ftrace_graph_func ftrace_graph_func
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs);

From 819cf65575230bca02ad669daadba2ac04187ae1 Mon Sep 17 00:00:00 2001
From: Youling Tang
Date: Mon, 1 May 2023 17:19:52 +0800
Subject: [PATCH 20/24] LoongArch: ftrace: Fix build error if
 DYNAMIC_FTRACE_WITH_REGS is not set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We can see the following build error if CONFIG_DYNAMIC_FTRACE_WITH_REGS
is not set on LoongArch:

arch/loongarch/kernel/ftrace_dyn.c: In function ‘ftrace_make_call’:
arch/loongarch/kernel/ftrace_dyn.c:167:23: error: implicit declaration of function ‘__get_mod’
  167 |         ret = __get_mod(&mod, pc);
      |               ^~~~~~~~~
arch/loongarch/kernel/ftrace_dyn.c:171:24: error: implicit declaration of function ‘get_plt_addr’
  171 |         addr = get_plt_addr(mod, addr);
      |                ^~~~~~~~~~~~

The reason is that __get_mod() and get_plt_addr() may be called in
ftrace_make_{call,nop} even when CONFIG_DYNAMIC_FTRACE_WITH_REGS is not
set, so move them out of that #ifdef block.
Signed-off-by: Youling Tang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/ftrace_dyn.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c index 4a3ef8516ccc..c5f4b4681ddc 100644 --- a/arch/loongarch/kernel/ftrace_dyn.c +++ b/arch/loongarch/kernel/ftrace_dyn.c @@ -30,8 +30,6 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, bool validate) return 0; } -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - #ifdef CONFIG_MODULES static inline int __get_mod(struct module **mod, unsigned long addr) { @@ -72,6 +70,7 @@ static unsigned long get_plt_addr(struct module *mod, unsigned long addr) } #endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { u32 old, new; @@ -102,7 +101,6 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned return ftrace_modify_code(pc, old, new, true); } - #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ int ftrace_update_ftrace_func(ftrace_func_t func) From 24d4f52791dae16e478741cc397ba660a95b9d02 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Mon, 1 May 2023 17:19:53 +0800 Subject: [PATCH 21/24] LoongArch: ftrace: Implement ftrace_find_callable_addr() to simplify code In the module processing functions, the same logic can be reused by implementing ftrace_find_callable_addr(). Signed-off-by: Youling Tang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/ftrace_dyn.c | 118 ++++++++++++++--------------- 1 file changed, 58 insertions(+), 60 deletions(-) diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c index c5f4b4681ddc..eb624af93617 100644 --- a/arch/loongarch/kernel/ftrace_dyn.c +++ b/arch/loongarch/kernel/ftrace_dyn.c @@ -31,16 +31,11 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, bool validate) } #ifdef CONFIG_MODULES -static inline int __get_mod(struct module **mod, unsigned long addr) +static bool reachable_by_bl(unsigned long addr, unsigned long pc) { - preempt_disable(); - *mod = __module_text_address(addr); - preempt_enable(); + long offset = (long)addr - (long)pc; - if (WARN_ON(!(*mod))) - return -EINVAL; - - return 0; + return offset >= -SZ_128M && offset < SZ_128M; } static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr) @@ -56,17 +51,58 @@ static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr) return NULL; } -static unsigned long get_plt_addr(struct module *mod, unsigned long addr) +/* + * Find the address the callsite must branch to in order to reach '*addr'. + * + * Due to the limited range of 'bl' instruction, modules may be placed too far + * away to branch directly and we must use a PLT. + * + * Returns true when '*addr' contains a reachable target address, or has been + * modified to contain a PLT address. Returns false otherwise. + */ +static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, struct module *mod, unsigned long *addr) { + unsigned long pc = rec->ip + LOONGARCH_INSN_SIZE; struct plt_entry *plt; - plt = get_ftrace_plt(mod, addr); - if (!plt) { - pr_err("ftrace: no module PLT for %ps\n", (void *)addr); - return -EINVAL; + /* + * When the target is within range of the 'bl' instruction, use 'addr' + * as-is and branch to that directly. 
+ */ + if (reachable_by_bl(*addr, pc)) + return true; + + /* + * 'mod' is only set at module load time, but if we end up + * dealing with an out-of-range condition, we can assume it + * is due to a module being loaded far away from the kernel. + * + * NOTE: __module_text_address() must be called with preemption + * disabled, but we can rely on ftrace_lock to ensure that 'mod' + * retains its validity throughout the remainder of this code. + */ + if (!mod) { + preempt_disable(); + mod = __module_text_address(pc); + preempt_enable(); } - return (unsigned long)plt; + if (WARN_ON(!mod)) + return false; + + plt = get_ftrace_plt(mod, *addr); + if (!plt) { + pr_err("ftrace: no module PLT for %ps\n", (void *)*addr); + return false; + } + + *addr = (unsigned long)plt; + return true; +} +#else /* !CONFIG_MODULES */ +static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, struct module *mod, unsigned long *addr) +{ + return true; } #endif @@ -75,26 +111,14 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned { u32 old, new; unsigned long pc; - long offset __maybe_unused; pc = rec->ip + LOONGARCH_INSN_SIZE; -#ifdef CONFIG_MODULES - offset = (long)pc - (long)addr; + if (!ftrace_find_callable_addr(rec, NULL, &addr)) + return -EINVAL; - if (offset < -SZ_128M || offset >= SZ_128M) { - int ret; - struct module *mod; - - ret = __get_mod(&mod, pc); - if (ret) - return ret; - - addr = get_plt_addr(mod, addr); - - old_addr = get_plt_addr(mod, old_addr); - } -#endif + if (!ftrace_find_callable_addr(rec, NULL, &old_addr)) + return -EINVAL; new = larch_insn_gen_bl(pc, addr); old = larch_insn_gen_bl(pc, old_addr); @@ -151,24 +175,11 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { u32 old, new; unsigned long pc; - long offset __maybe_unused; pc = rec->ip + LOONGARCH_INSN_SIZE; -#ifdef CONFIG_MODULES - offset = (long)pc - (long)addr; - - if (offset < -SZ_128M || offset >= SZ_128M) { - int ret; - struct module *mod; - - ret = __get_mod(&mod, pc); - if (ret) - return ret; - - addr = get_plt_addr(mod, addr); - } -#endif + if (!ftrace_find_callable_addr(rec, NULL, &addr)) + return -EINVAL; old = larch_insn_gen_nop(); new = larch_insn_gen_bl(pc, addr); @@ -180,24 +191,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long ad { u32 old, new; unsigned long pc; - long offset __maybe_unused; pc = rec->ip + LOONGARCH_INSN_SIZE; -#ifdef CONFIG_MODULES - offset = (long)pc - (long)addr; - - if (offset < -SZ_128M || offset >= SZ_128M) { - int ret; - struct module *mod; - - ret = __get_mod(&mod, pc); - if (ret) - return ret; - - addr = get_plt_addr(mod, addr); - } -#endif + if (!ftrace_find_callable_addr(rec, NULL, &addr)) + return -EINVAL; new = larch_insn_gen_nop(); old = larch_insn_gen_bl(pc, addr); From 9cdc3b6a299c6314485bcfb695546c11d35dac4c Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Mon, 1 May 2023 17:19:53 +0800 Subject: [PATCH 22/24] LoongArch: ftrace: Add direct call support Select the HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS to provide the register_ftrace_direct[_multi] interfaces allowing users to register the customed trampoline (direct_caller) as the mcount for one or more target functions. And modify_ftrace_direct[_multi] are also provided for modifying direct_caller. 
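As an illustrative sketch (mirroring the samples added later in this series;
my_tramp stands for a hand-written asm trampoline), registering a direct
caller looks roughly like:

  #include <linux/ftrace.h>
  #include <linux/sched.h>

  extern void my_tramp(void);   /* asm trampoline, see samples/ftrace/ */

  static int __init my_direct_init(void)
  {
          /* make wake_up_process()'s patchsite call my_tramp directly */
          return register_ftrace_direct((unsigned long)wake_up_process,
                                        (unsigned long)my_tramp);
  }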
There are a few cases to distinguish:
- If a direct call ops is the only one tracing a function AND the direct
  called trampoline is within the reach of a 'bl' instruction
   -> the ftrace patchsite jumps to the trampoline
- Else
   -> the ftrace patchsite jumps to the ftrace_regs_caller trampoline,
      which iterates over all registered ftrace ops (its patchsite ops is
      ftrace_list_ops), including the direct call ops, and calls its
      call_direct_funcs handler; that handler stores the direct called
      trampoline's address in the ftrace_regs, and the ftrace_regs_caller
      trampoline then returns to that address instead of returning to the
      traced function

Signed-off-by: Qing Zhang
Signed-off-by: Youling Tang
Signed-off-by: Huacai Chen
---
 arch/loongarch/Kconfig              |  1 +
 arch/loongarch/include/asm/ftrace.h | 12 ++++++++++++
 arch/loongarch/kernel/ftrace_dyn.c  |  8 ++++++++
 arch/loongarch/kernel/mcount_dyn.S  | 13 ++++++++++++-
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 629a6dac33e1..3a79bde6bf5b 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -93,6 +93,7 @@ config LOONGARCH
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_ARGS
+	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_EBPF_JIT
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN

diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h
index aa5c57b5f333..23e2ba78dcb0 100644
--- a/arch/loongarch/include/asm/ftrace.h
+++ b/arch/loongarch/include/asm/ftrace.h
@@ -82,6 +82,18 @@ ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip)
 #define ftrace_graph_func ftrace_graph_func
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static inline void
+__arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
+{
+	regs->regs[13] = addr;	/* t1 */
+}
+
+#define arch_ftrace_set_direct_caller(fregs, addr) \
+	__arch_ftrace_set_direct_caller(&(fregs)->regs, addr)
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
 #endif

diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c
index eb624af93617..73858c9029cc 100644
--- a/arch/loongarch/kernel/ftrace_dyn.c
+++ b/arch/loongarch/kernel/ftrace_dyn.c
@@ -65,6 +65,14 @@ static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, struct module *mod
 	unsigned long pc = rec->ip + LOONGARCH_INSN_SIZE;
 	struct plt_entry *plt;
 
+	/*
+	 * If a custom trampoline is unreachable, rely on the ftrace_regs_caller
+	 * trampoline which knows how to indirectly reach that trampoline through
+	 * ops->direct_call.
+	 */
+	if (*addr != FTRACE_ADDR && *addr != FTRACE_REGS_ADDR && !reachable_by_bl(*addr, pc))
+		*addr = FTRACE_REGS_ADDR;
+
 	/*
 	 * When the target is within range of the 'bl' instruction, use 'addr'
 	 * as-is and branch to that directly.
diff --git a/arch/loongarch/kernel/mcount_dyn.S b/arch/loongarch/kernel/mcount_dyn.S index bbabf06244c2..c7d961fc72c2 100644 --- a/arch/loongarch/kernel/mcount_dyn.S +++ b/arch/loongarch/kernel/mcount_dyn.S @@ -42,7 +42,6 @@ .if \allregs PTR_S tp, sp, PT_R2 PTR_S t0, sp, PT_R12 - PTR_S t1, sp, PT_R13 PTR_S t2, sp, PT_R14 PTR_S t3, sp, PT_R15 PTR_S t4, sp, PT_R16 @@ -64,6 +63,8 @@ PTR_S zero, sp, PT_R0 .endif PTR_S ra, sp, PT_ERA /* Save trace function ra at PT_ERA */ + move t1, zero + PTR_S t1, sp, PT_R13 PTR_ADDI t8, sp, PT_SIZE PTR_S t8, sp, PT_R3 .endm @@ -104,8 +105,12 @@ ftrace_common_return: PTR_L a7, sp, PT_R11 PTR_L fp, sp, PT_R22 PTR_L t0, sp, PT_ERA + PTR_L t1, sp, PT_R13 PTR_ADDI sp, sp, PT_SIZE + bnez t1, .Ldirect jr t0 +.Ldirect: + jr t1 SYM_CODE_END(ftrace_common) SYM_CODE_START(ftrace_caller) @@ -147,3 +152,9 @@ SYM_CODE_START(return_to_handler) jr ra SYM_CODE_END(return_to_handler) #endif + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_CODE_START(ftrace_stub_direct_tramp) + jr t0 +SYM_CODE_END(ftrace_stub_direct_tramp) +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ From 22f367a689ceceb08d9ce6a65c43c9640f5cb935 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Mon, 1 May 2023 17:19:53 +0800 Subject: [PATCH 23/24] LoongArch: ftrace: Add direct call trampoline samples support The ftrace samples need per-architecture trampoline implementations to save and restore argument registers around the calls to my_direct_func* and to restore polluted registers (e.g: ra). Signed-off-by: Qing Zhang Signed-off-by: Youling Tang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 2 + samples/ftrace/ftrace-direct-modify.c | 34 +++++++++++++++++ samples/ftrace/ftrace-direct-multi-modify.c | 41 +++++++++++++++++++++ samples/ftrace/ftrace-direct-multi.c | 25 +++++++++++++ samples/ftrace/ftrace-direct-too.c | 27 ++++++++++++++ samples/ftrace/ftrace-direct.c | 23 ++++++++++++ 6 files changed, 152 insertions(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 3a79bde6bf5b..ac3d3d9b5716 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -120,6 +120,8 @@ config LOONGARCH select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ + select HAVE_SAMPLE_FTRACE_DIRECT + select HAVE_SAMPLE_FTRACE_DIRECT_MULTI select HAVE_SETUP_PER_CPU_AREA if NUMA select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index d93abbcb1f4c..ca72c3b710eb 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -96,6 +96,40 @@ asm ( #endif /* CONFIG_S390 */ +#ifdef CONFIG_LOONGARCH + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" +" addi.d $sp, $sp, -16\n" +" st.d $t0, $sp, 0\n" +" st.d $ra, $sp, 8\n" +" bl my_direct_func1\n" +" ld.d $t0, $sp, 0\n" +" ld.d $ra, $sp, 8\n" +" addi.d $sp, $sp, 16\n" +" jr $t0\n" +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:\n" +" addi.d $sp, $sp, -16\n" +" st.d $t0, $sp, 0\n" +" st.d $ra, $sp, 8\n" +" bl my_direct_func2\n" +" ld.d $t0, $sp, 0\n" +" ld.d $ra, $sp, 8\n" +" addi.d $sp, $sp, 16\n" +" jr $t0\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_LOONGARCH */ + static unsigned long my_tramp = (unsigned long)my_tramp1; static unsigned long tramps[2] = { (unsigned long)my_tramp1, diff --git 
a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c index b58c594efb51..4708c24d47c6 100644 --- a/samples/ftrace/ftrace-direct-multi-modify.c +++ b/samples/ftrace/ftrace-direct-multi-modify.c @@ -103,6 +103,47 @@ asm ( #endif /* CONFIG_S390 */ +#ifdef CONFIG_LOONGARCH +#include + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" +" addi.d $sp, $sp, -32\n" +" st.d $a0, $sp, 0\n" +" st.d $t0, $sp, 8\n" +" st.d $ra, $sp, 16\n" +" move $a0, $t0\n" +" bl my_direct_func1\n" +" ld.d $a0, $sp, 0\n" +" ld.d $t0, $sp, 8\n" +" ld.d $ra, $sp, 16\n" +" addi.d $sp, $sp, 32\n" +" jr $t0\n" +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:\n" +" addi.d $sp, $sp, -32\n" +" st.d $a0, $sp, 0\n" +" st.d $t0, $sp, 8\n" +" st.d $ra, $sp, 16\n" +" move $a0, $t0\n" +" bl my_direct_func2\n" +" ld.d $a0, $sp, 0\n" +" ld.d $t0, $sp, 8\n" +" ld.d $ra, $sp, 16\n" +" addi.d $sp, $sp, 32\n" +" jr $t0\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_LOONGARCH */ + static unsigned long my_tramp = (unsigned long)my_tramp1; static unsigned long tramps[2] = { (unsigned long)my_tramp1, diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c index c27cf130c319..c2f1652c67bc 100644 --- a/samples/ftrace/ftrace-direct-multi.c +++ b/samples/ftrace/ftrace-direct-multi.c @@ -66,6 +66,31 @@ asm ( #endif /* CONFIG_S390 */ +#ifdef CONFIG_LOONGARCH + +#include +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi.d $sp, $sp, -32\n" +" st.d $a0, $sp, 0\n" +" st.d $t0, $sp, 8\n" +" st.d $ra, $sp, 16\n" +" move $a0, $t0\n" +" bl my_direct_func\n" +" ld.d $a0, $sp, 0\n" +" ld.d $t0, $sp, 8\n" +" ld.d $ra, $sp, 16\n" +" addi.d $sp, $sp, 32\n" +" jr $t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_LOONGARCH */ + static struct ftrace_ops direct; static int __init ftrace_direct_multi_init(void) diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index 8139dce2a31c..ef64d7509773 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -70,6 +70,33 @@ asm ( #endif /* CONFIG_S390 */ +#ifdef CONFIG_LOONGARCH + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi.d $sp, $sp, -48\n" +" st.d $a0, $sp, 0\n" +" st.d $a1, $sp, 8\n" +" st.d $a2, $sp, 16\n" +" st.d $t0, $sp, 24\n" +" st.d $ra, $sp, 32\n" +" bl my_direct_func\n" +" ld.d $a0, $sp, 0\n" +" ld.d $a1, $sp, 8\n" +" ld.d $a2, $sp, 16\n" +" ld.d $t0, $sp, 24\n" +" ld.d $ra, $sp, 32\n" +" addi.d $sp, $sp, 48\n" +" jr $t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_LOONGARCH */ + static int __init ftrace_direct_init(void) { return register_ftrace_direct((unsigned long)handle_mm_fault, diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index 1d3d307ca33d..9be720957bf8 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -63,6 +63,29 @@ asm ( #endif /* CONFIG_S390 */ +#ifdef CONFIG_LOONGARCH + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi.d $sp, $sp, -32\n" +" st.d $a0, $sp, 0\n" +" st.d $t0, $sp, 8\n" +" st.d $ra, $sp, 16\n" +" bl my_direct_func\n" +" 
ld.d $a0, $sp, 0\n" +" ld.d $t0, $sp, 8\n" +" ld.d $ra, $sp, 16\n" +" addi.d $sp, $sp, 32\n" +" jr $t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_LOONGARCH */ + static int __init ftrace_direct_init(void) { return register_ftrace_direct((unsigned long)wake_up_process, From 2fa5ebe3bc4e31e07a99196455498472417842f2 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 1 May 2023 17:19:59 +0800 Subject: [PATCH 24/24] tools/perf: Add basic support for LoongArch Add basic support for LoongArch, which is very similar to the MIPS version. Signed-off-by: Ming Wang Signed-off-by: Huacai Chen --- .../loongarch/include/uapi/asm/perf_regs.h | 40 +++++++++ .../arch/loongarch/include/uapi/asm/unistd.h | 9 ++ tools/perf/Makefile.config | 12 ++- tools/perf/arch/loongarch/Build | 1 + tools/perf/arch/loongarch/Makefile | 28 +++++++ .../arch/loongarch/annotate/instructions.c | 45 ++++++++++ .../loongarch/entry/syscalls/mksyscalltbl | 61 ++++++++++++++ .../arch/loongarch/include/dwarf-regs-table.h | 16 ++++ tools/perf/arch/loongarch/include/perf_regs.h | 15 ++++ tools/perf/arch/loongarch/util/Build | 5 ++ tools/perf/arch/loongarch/util/dwarf-regs.c | 44 ++++++++++ tools/perf/arch/loongarch/util/perf_regs.c | 6 ++ tools/perf/arch/loongarch/util/unwind-libdw.c | 56 +++++++++++++ .../arch/loongarch/util/unwind-libunwind.c | 82 +++++++++++++++++++ tools/perf/check-headers.sh | 1 + tools/perf/util/annotate.c | 8 ++ tools/perf/util/dwarf-regs.c | 7 ++ tools/perf/util/env.c | 2 + tools/perf/util/genelf.h | 3 + tools/perf/util/perf_regs.c | 76 +++++++++++++++++ tools/perf/util/syscalltbl.c | 4 + 21 files changed, 518 insertions(+), 3 deletions(-) create mode 100644 tools/arch/loongarch/include/uapi/asm/perf_regs.h create mode 100644 tools/arch/loongarch/include/uapi/asm/unistd.h create mode 100644 tools/perf/arch/loongarch/Build create mode 100644 tools/perf/arch/loongarch/Makefile create mode 100644 tools/perf/arch/loongarch/annotate/instructions.c create mode 100755 tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl create mode 100644 tools/perf/arch/loongarch/include/dwarf-regs-table.h create mode 100644 tools/perf/arch/loongarch/include/perf_regs.h create mode 100644 tools/perf/arch/loongarch/util/Build create mode 100644 tools/perf/arch/loongarch/util/dwarf-regs.c create mode 100644 tools/perf/arch/loongarch/util/perf_regs.c create mode 100644 tools/perf/arch/loongarch/util/unwind-libdw.c create mode 100644 tools/perf/arch/loongarch/util/unwind-libunwind.c diff --git a/tools/arch/loongarch/include/uapi/asm/perf_regs.h b/tools/arch/loongarch/include/uapi/asm/perf_regs.h new file mode 100644 index 000000000000..29d69c00fc7a --- /dev/null +++ b/tools/arch/loongarch/include/uapi/asm/perf_regs.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_LOONGARCH_PERF_REGS_H +#define _ASM_LOONGARCH_PERF_REGS_H + +enum perf_event_loongarch_regs { + PERF_REG_LOONGARCH_PC, + PERF_REG_LOONGARCH_R1, + PERF_REG_LOONGARCH_R2, + PERF_REG_LOONGARCH_R3, + PERF_REG_LOONGARCH_R4, + PERF_REG_LOONGARCH_R5, + PERF_REG_LOONGARCH_R6, + PERF_REG_LOONGARCH_R7, + PERF_REG_LOONGARCH_R8, + PERF_REG_LOONGARCH_R9, + PERF_REG_LOONGARCH_R10, + PERF_REG_LOONGARCH_R11, + PERF_REG_LOONGARCH_R12, + PERF_REG_LOONGARCH_R13, + PERF_REG_LOONGARCH_R14, + PERF_REG_LOONGARCH_R15, + PERF_REG_LOONGARCH_R16, + PERF_REG_LOONGARCH_R17, + PERF_REG_LOONGARCH_R18, + PERF_REG_LOONGARCH_R19, + PERF_REG_LOONGARCH_R20, + PERF_REG_LOONGARCH_R21, + PERF_REG_LOONGARCH_R22, + 
PERF_REG_LOONGARCH_R23, + PERF_REG_LOONGARCH_R24, + PERF_REG_LOONGARCH_R25, + PERF_REG_LOONGARCH_R26, + PERF_REG_LOONGARCH_R27, + PERF_REG_LOONGARCH_R28, + PERF_REG_LOONGARCH_R29, + PERF_REG_LOONGARCH_R30, + PERF_REG_LOONGARCH_R31, + PERF_REG_LOONGARCH_MAX, +}; +#endif /* _ASM_LOONGARCH_PERF_REGS_H */ diff --git a/tools/arch/loongarch/include/uapi/asm/unistd.h b/tools/arch/loongarch/include/uapi/asm/unistd.h new file mode 100644 index 000000000000..0c743344e92d --- /dev/null +++ b/tools/arch/loongarch/include/uapi/asm/unistd.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 + +#include diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 3519a0139026..c0a208f9b67b 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -38,7 +38,7 @@ ifneq ($(NO_SYSCALL_TABLE),1) NO_SYSCALL_TABLE := 0 endif else - ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390 mips)) + ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390 mips loongarch)) NO_SYSCALL_TABLE := 0 endif endif @@ -80,6 +80,12 @@ ifeq ($(SRCARCH),arm64) LIBUNWIND_LIBS = -lunwind -lunwind-aarch64 endif +ifeq ($(SRCARCH),loongarch) + NO_PERF_REGS := 0 + CFLAGS += -I$(OUTPUT)arch/loongarch/include/generated + LIBUNWIND_LIBS = -lunwind -lunwind-loongarch64 +endif + ifeq ($(SRCARCH),riscv) NO_PERF_REGS := 0 endif @@ -107,7 +113,7 @@ endif # Disable it on all other architectures in case libdw unwind # support is detected in system. Add supported architectures # to the check. -ifneq ($(SRCARCH),$(filter $(SRCARCH),x86 arm arm64 powerpc s390 csky riscv)) +ifneq ($(SRCARCH),$(filter $(SRCARCH),x86 arm arm64 powerpc s390 csky riscv loongarch)) NO_LIBDW_DWARF_UNWIND := 1 endif @@ -129,7 +135,7 @@ endef ifdef LIBUNWIND_DIR LIBUNWIND_CFLAGS = -I$(LIBUNWIND_DIR)/include LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib - LIBUNWIND_ARCHS = x86 x86_64 arm aarch64 debug-frame-arm debug-frame-aarch64 + LIBUNWIND_ARCHS = x86 x86_64 arm aarch64 debug-frame-arm debug-frame-aarch64 loongarch $(foreach libunwind_arch,$(LIBUNWIND_ARCHS),$(call libunwind_arch_set_flags,$(libunwind_arch))) endif diff --git a/tools/perf/arch/loongarch/Build b/tools/perf/arch/loongarch/Build new file mode 100644 index 000000000000..e4e5f33c84d8 --- /dev/null +++ b/tools/perf/arch/loongarch/Build @@ -0,0 +1 @@ +perf-y += util/ diff --git a/tools/perf/arch/loongarch/Makefile b/tools/perf/arch/loongarch/Makefile new file mode 100644 index 000000000000..c392e7af4743 --- /dev/null +++ b/tools/perf/arch/loongarch/Makefile @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0 +ifndef NO_DWARF +PERF_HAVE_DWARF_REGS := 1 +endif +PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 +PERF_HAVE_JITDUMP := 1 + +# +# Syscall table generation for perf +# + +out := $(OUTPUT)arch/loongarch/include/generated/asm +header := $(out)/syscalls.c +incpath := $(srctree)/tools +sysdef := $(srctree)/tools/arch/loongarch/include/uapi/asm/unistd.h +sysprf := $(srctree)/tools/perf/arch/loongarch/entry/syscalls/ +systbl := $(sysprf)/mksyscalltbl + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') + +$(header): $(sysdef) $(systbl) + $(Q)$(SHELL) '$(systbl)' '$(CC)' '$(HOSTCC)' $(incpath) $(sysdef) > $@ + +clean:: + $(call QUIET_CLEAN, loongarch) $(RM) $(header) + +archheaders: $(header) diff --git a/tools/perf/arch/loongarch/annotate/instructions.c 
b/tools/perf/arch/loongarch/annotate/instructions.c new file mode 100644 index 000000000000..ab21bf122135 --- /dev/null +++ b/tools/perf/arch/loongarch/annotate/instructions.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Perf annotate functions. + * + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +static +struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name) +{ + struct ins_ops *ops = NULL; + + if (!strncmp(name, "beqz", 4) || + !strncmp(name, "bnez", 4) || + !strncmp(name, "beq", 3) || + !strncmp(name, "bne", 3) || + !strncmp(name, "blt", 3) || + !strncmp(name, "bge", 3) || + !strncmp(name, "bltu", 4) || + !strncmp(name, "bgeu", 4) || + !strncmp(name, "bl", 2)) + ops = &call_ops; + else if (!strncmp(name, "jirl", 4)) + ops = &ret_ops; + else if (name[0] == 'b') + ops = &jump_ops; + else + return NULL; + + arch__associate_ins_ops(arch, name, ops); + + return ops; +} + +static +int loongarch__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + if (!arch->initialized) { + arch->associate_instruction_ops = loongarch__associate_ins_ops; + arch->initialized = true; + arch->objdump.comment_char = '#'; + } + + return 0; +} diff --git a/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl b/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl new file mode 100755 index 000000000000..c52156f7204d --- /dev/null +++ b/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl @@ -0,0 +1,61 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Generate system call table for perf. Derived from +# powerpc script. +# +# Author(s): Ming Wang +# Author(s): Huacai Chen +# Copyright (C) 2020-2023 Loongson Technology Corporation Limited + +gcc=$1 +hostcc=$2 +incpath=$3 +input=$4 + +if ! test -r $input; then + echo "Could not read input file" >&2 + exit 1 +fi + +create_table_from_c() +{ + local sc nr last_sc + + create_table_exe=`mktemp ${TMPDIR:-/tmp}/create-table-XXXXXX` + + { + + cat <<-_EoHEADER + #include + #include "$input" + int main(int argc, char *argv[]) + { + _EoHEADER + + while read sc nr; do + printf "%s\n" " printf(\"\\t[%d] = \\\"$sc\\\",\\n\", $nr);" + last_sc=$nr + done + + printf "%s\n" " printf(\"#define SYSCALLTBL_LOONGARCH_MAX_ID %d\\n\", $last_sc);" + printf "}\n" + + } | $hostcc -I $incpath/include/uapi -o $create_table_exe -x c - + + $create_table_exe + + rm -f $create_table_exe +} + +create_table() +{ + echo "static const char *syscalltbl_loongarch[] = {" + create_table_from_c + echo "};" +} + +$gcc -E -dM -x c -I $incpath/include/uapi $input \ + |sed -ne 's/^#define __NR_//p' \ + |sort -t' ' -k2 -n \ + |create_table diff --git a/tools/perf/arch/loongarch/include/dwarf-regs-table.h b/tools/perf/arch/loongarch/include/dwarf-regs-table.h new file mode 100644 index 000000000000..bb3944f5764a --- /dev/null +++ b/tools/perf/arch/loongarch/include/dwarf-regs-table.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * dwarf-regs-table.h : Mapping of DWARF debug register numbers into + * register names. 
+ * + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifdef DEFINE_DWARF_REGSTR_TABLE +static const char * const loongarch_regstr_tbl[] = { + "%r0", "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", + "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", + "%r16", "%r17", "%r18", "%r19", "%r20", "%r21", "%r22", "%r23", + "%r24", "%r25", "%r26", "%r27", "%r28", "%r29", "%r30", "%r31", +}; +#endif diff --git a/tools/perf/arch/loongarch/include/perf_regs.h b/tools/perf/arch/loongarch/include/perf_regs.h new file mode 100644 index 000000000000..7833c7dbd38d --- /dev/null +++ b/tools/perf/arch/loongarch/include/perf_regs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_PERF_REGS_H +#define ARCH_PERF_REGS_H + +#include +#include +#include + +#define PERF_REGS_MAX PERF_REG_LOONGARCH_MAX +#define PERF_REG_IP PERF_REG_LOONGARCH_PC +#define PERF_REG_SP PERF_REG_LOONGARCH_R3 + +#define PERF_REGS_MASK ((1ULL << PERF_REG_LOONGARCH_MAX) - 1) + +#endif /* ARCH_PERF_REGS_H */ diff --git a/tools/perf/arch/loongarch/util/Build b/tools/perf/arch/loongarch/util/Build new file mode 100644 index 000000000000..d776125a2d06 --- /dev/null +++ b/tools/perf/arch/loongarch/util/Build @@ -0,0 +1,5 @@ +perf-y += perf_regs.o + +perf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o +perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/loongarch/util/dwarf-regs.c b/tools/perf/arch/loongarch/util/dwarf-regs.c new file mode 100644 index 000000000000..0f6ebc387463 --- /dev/null +++ b/tools/perf/arch/loongarch/util/dwarf-regs.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * dwarf-regs.c : Mapping of DWARF debug register numbers into register names. + * + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include +#include /* for EINVAL */ +#include /* for strcmp */ +#include + +struct pt_regs_dwarfnum { + const char *name; + unsigned int dwarfnum; +}; + +static struct pt_regs_dwarfnum loongarch_gpr_table[] = { + {"%r0", 0}, {"%r1", 1}, {"%r2", 2}, {"%r3", 3}, + {"%r4", 4}, {"%r5", 5}, {"%r6", 6}, {"%r7", 7}, + {"%r8", 8}, {"%r9", 9}, {"%r10", 10}, {"%r11", 11}, + {"%r12", 12}, {"%r13", 13}, {"%r14", 14}, {"%r15", 15}, + {"%r16", 16}, {"%r17", 17}, {"%r18", 18}, {"%r19", 19}, + {"%r20", 20}, {"%r21", 21}, {"%r22", 22}, {"%r23", 23}, + {"%r24", 24}, {"%r25", 25}, {"%r26", 26}, {"%r27", 27}, + {"%r28", 28}, {"%r29", 29}, {"%r30", 30}, {"%r31", 31}, + {NULL, 0} +}; + +const char *get_arch_regstr(unsigned int n) +{ + n %= 32; + return loongarch_gpr_table[n].name; +} + +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_dwarfnum *roff; + + for (roff = loongarch_gpr_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->dwarfnum; + return -EINVAL; +} diff --git a/tools/perf/arch/loongarch/util/perf_regs.c b/tools/perf/arch/loongarch/util/perf_regs.c new file mode 100644 index 000000000000..2833e101a7c6 --- /dev/null +++ b/tools/perf/arch/loongarch/util/perf_regs.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../../util/perf_regs.h" + +const struct sample_reg sample_reg_masks[] = { + SMPL_REG_END +}; diff --git a/tools/perf/arch/loongarch/util/unwind-libdw.c b/tools/perf/arch/loongarch/util/unwind-libdw.c new file mode 100644 index 000000000000..a9415385230a --- /dev/null +++ b/tools/perf/arch/loongarch/util/unwind-libdw.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 
2020-2023 Loongson Technology Corporation Limited */ + +#include +#include "../../util/unwind-libdw.h" +#include "../../util/perf_regs.h" +#include "../../util/sample.h" + +bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = &ui->sample->user_regs; + Dwarf_Word dwarf_regs[PERF_REG_LOONGARCH_MAX]; + +#define REG(r) ({ \ + Dwarf_Word val = 0; \ + perf_reg_value(&val, user_regs, PERF_REG_LOONGARCH_##r); \ + val; \ +}) + + dwarf_regs[0] = 0; + dwarf_regs[1] = REG(R1); + dwarf_regs[2] = REG(R2); + dwarf_regs[3] = REG(R3); + dwarf_regs[4] = REG(R4); + dwarf_regs[5] = REG(R5); + dwarf_regs[6] = REG(R6); + dwarf_regs[7] = REG(R7); + dwarf_regs[8] = REG(R8); + dwarf_regs[9] = REG(R9); + dwarf_regs[10] = REG(R10); + dwarf_regs[11] = REG(R11); + dwarf_regs[12] = REG(R12); + dwarf_regs[13] = REG(R13); + dwarf_regs[14] = REG(R14); + dwarf_regs[15] = REG(R15); + dwarf_regs[16] = REG(R16); + dwarf_regs[17] = REG(R17); + dwarf_regs[18] = REG(R18); + dwarf_regs[19] = REG(R19); + dwarf_regs[20] = REG(R20); + dwarf_regs[21] = REG(R21); + dwarf_regs[22] = REG(R22); + dwarf_regs[23] = REG(R23); + dwarf_regs[24] = REG(R24); + dwarf_regs[25] = REG(R25); + dwarf_regs[26] = REG(R26); + dwarf_regs[27] = REG(R27); + dwarf_regs[28] = REG(R28); + dwarf_regs[29] = REG(R29); + dwarf_regs[30] = REG(R30); + dwarf_regs[31] = REG(R31); + dwfl_thread_state_register_pc(thread, REG(PC)); + + return dwfl_thread_state_registers(thread, 0, PERF_REG_LOONGARCH_MAX, dwarf_regs); +} diff --git a/tools/perf/arch/loongarch/util/unwind-libunwind.c b/tools/perf/arch/loongarch/util/unwind-libunwind.c new file mode 100644 index 000000000000..f693167b86ef --- /dev/null +++ b/tools/perf/arch/loongarch/util/unwind-libunwind.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "perf_regs.h" +#include "../../util/unwind.h" +#include "util/debug.h" + +int libunwind__arch_reg_id(int regnum) +{ + switch (regnum) { + case UNW_LOONGARCH64_R1: + return PERF_REG_LOONGARCH_R1; + case UNW_LOONGARCH64_R2: + return PERF_REG_LOONGARCH_R2; + case UNW_LOONGARCH64_R3: + return PERF_REG_LOONGARCH_R3; + case UNW_LOONGARCH64_R4: + return PERF_REG_LOONGARCH_R4; + case UNW_LOONGARCH64_R5: + return PERF_REG_LOONGARCH_R5; + case UNW_LOONGARCH64_R6: + return PERF_REG_LOONGARCH_R6; + case UNW_LOONGARCH64_R7: + return PERF_REG_LOONGARCH_R7; + case UNW_LOONGARCH64_R8: + return PERF_REG_LOONGARCH_R8; + case UNW_LOONGARCH64_R9: + return PERF_REG_LOONGARCH_R9; + case UNW_LOONGARCH64_R10: + return PERF_REG_LOONGARCH_R10; + case UNW_LOONGARCH64_R11: + return PERF_REG_LOONGARCH_R11; + case UNW_LOONGARCH64_R12: + return PERF_REG_LOONGARCH_R12; + case UNW_LOONGARCH64_R13: + return PERF_REG_LOONGARCH_R13; + case UNW_LOONGARCH64_R14: + return PERF_REG_LOONGARCH_R14; + case UNW_LOONGARCH64_R15: + return PERF_REG_LOONGARCH_R15; + case UNW_LOONGARCH64_R16: + return PERF_REG_LOONGARCH_R16; + case UNW_LOONGARCH64_R17: + return PERF_REG_LOONGARCH_R17; + case UNW_LOONGARCH64_R18: + return PERF_REG_LOONGARCH_R18; + case UNW_LOONGARCH64_R19: + return PERF_REG_LOONGARCH_R19; + case UNW_LOONGARCH64_R20: + return PERF_REG_LOONGARCH_R20; + case UNW_LOONGARCH64_R21: + return PERF_REG_LOONGARCH_R21; + case UNW_LOONGARCH64_R22: + return PERF_REG_LOONGARCH_R22; + case UNW_LOONGARCH64_R23: + return PERF_REG_LOONGARCH_R23; + case UNW_LOONGARCH64_R24: + return PERF_REG_LOONGARCH_R24; + case UNW_LOONGARCH64_R25: + return PERF_REG_LOONGARCH_R25; + case UNW_LOONGARCH64_R26: + return 
+		return PERF_REG_LOONGARCH_R26;
+	case UNW_LOONGARCH64_R27:
+		return PERF_REG_LOONGARCH_R27;
+	case UNW_LOONGARCH64_R28:
+		return PERF_REG_LOONGARCH_R28;
+	case UNW_LOONGARCH64_R29:
+		return PERF_REG_LOONGARCH_R29;
+	case UNW_LOONGARCH64_R30:
+		return PERF_REG_LOONGARCH_R30;
+	case UNW_LOONGARCH64_R31:
+		return PERF_REG_LOONGARCH_R31;
+	case UNW_LOONGARCH64_PC:
+		return PERF_REG_LOONGARCH_PC;
+	default:
+		pr_err("unwind: invalid reg id %d\n", regnum);
+		return -EINVAL;
+	}
+
+	return -EINVAL;
+}
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index eacca9a874e2..9d6232f681ce 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -40,6 +40,7 @@ arch/x86/lib/x86-opcode-map.txt
 arch/x86/tools/gen-insn-attr-x86.awk
 arch/arm/include/uapi/asm/perf_regs.h
 arch/arm64/include/uapi/asm/perf_regs.h
+arch/loongarch/include/uapi/asm/perf_regs.h
 arch/mips/include/uapi/asm/perf_regs.h
 arch/powerpc/include/uapi/asm/perf_regs.h
 arch/s390/include/uapi/asm/perf_regs.h
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index db475e44f42f..0cc7710f32da 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -149,6 +149,7 @@ static int arch__associate_ins_ops(struct arch* arch, const char *name, struct i
 #include "arch/arm/annotate/instructions.c"
 #include "arch/arm64/annotate/instructions.c"
 #include "arch/csky/annotate/instructions.c"
+#include "arch/loongarch/annotate/instructions.c"
 #include "arch/mips/annotate/instructions.c"
 #include "arch/x86/annotate/instructions.c"
 #include "arch/powerpc/annotate/instructions.c"
@@ -211,6 +212,13 @@ static struct arch architectures[] = {
 			.comment_char = '#',
 		},
 	},
+	{
+		.name = "loongarch",
+		.init = loongarch__annotate_init,
+		.objdump = {
+			.comment_char = '#',
+		},
+	},
 };
 
 static void ins__delete(struct ins_operands *ops)
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index 3fa4486742cd..69cfaa5953bf 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -14,6 +14,10 @@
 #define EM_AARCH64	183  /* ARM 64 bit */
 #endif
 
+#ifndef EM_LOONGARCH
+#define EM_LOONGARCH	258 /* LoongArch */
+#endif
+
 /* Define const char * {arch}_register_tbl[] */
 #define DEFINE_DWARF_REGSTR_TABLE
 #include "../arch/x86/include/dwarf-regs-table.h"
@@ -25,6 +29,7 @@
 #include "../arch/sparc/include/dwarf-regs-table.h"
 #include "../arch/xtensa/include/dwarf-regs-table.h"
 #include "../arch/mips/include/dwarf-regs-table.h"
+#include "../arch/loongarch/include/dwarf-regs-table.h"
 
 #define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL)
@@ -56,6 +61,8 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine)
 		return __get_dwarf_regstr(xtensa_regstr_tbl, n);
 	case EM_MIPS:
 		return __get_dwarf_regstr(mips_regstr_tbl, n);
+	case EM_LOONGARCH:
+		return __get_dwarf_regstr(loongarch_regstr_tbl, n);
 	default:
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 	}
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index 5b8cf6a421a4..0d5d40cb997b 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -435,6 +435,8 @@ static const char *normalize_arch(char *arch)
 		return "mips";
 	if (!strncmp(arch, "sh", 2) && isdigit(arch[2]))
 		return "sh";
+	if (!strncmp(arch, "loongarch", 9))
+		return "loongarch";
 
 	return arch;
 }
diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h
index 6af062d1c452..5f18d20ea903 100644
--- a/tools/perf/util/genelf.h
+++ b/tools/perf/util/genelf.h
@@ -43,6 +43,9 @@ int jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_ent
 #elif defined(__riscv) && __riscv_xlen == 64
 #define GEN_ELF_ARCH	EM_RISCV
 #define GEN_ELF_CLASS	ELFCLASS64
+#elif defined(__loongarch__)
+#define GEN_ELF_ARCH	EM_LOONGARCH
+#define GEN_ELF_CLASS	ELFCLASS64
 #else
 #error "unsupported architecture"
 #endif
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 57a567ee2cea..9bdbaa37f813 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -28,6 +28,7 @@ uint64_t __weak arch__user_reg_mask(void)
 
 #include "../../arch/arm/include/uapi/asm/perf_regs.h"
 #include "../../arch/csky/include/uapi/asm/perf_regs.h"
+#include "../../arch/loongarch/include/uapi/asm/perf_regs.h"
 #include "../../arch/mips/include/uapi/asm/perf_regs.h"
 #include "../../arch/powerpc/include/uapi/asm/perf_regs.h"
 #include "../../arch/riscv/include/uapi/asm/perf_regs.h"
@@ -236,6 +237,79 @@ static const char *__perf_reg_name_csky(int id)
 	return NULL;
 }
 
+static inline const char *__perf_reg_name_loongarch(int id)
+{
+	switch (id) {
+	case PERF_REG_LOONGARCH_PC:
+		return "PC";
+	case PERF_REG_LOONGARCH_R1:
+		return "%r1";
+	case PERF_REG_LOONGARCH_R2:
+		return "%r2";
+	case PERF_REG_LOONGARCH_R3:
+		return "%r3";
+	case PERF_REG_LOONGARCH_R4:
+		return "%r4";
+	case PERF_REG_LOONGARCH_R5:
+		return "%r5";
+	case PERF_REG_LOONGARCH_R6:
+		return "%r6";
+	case PERF_REG_LOONGARCH_R7:
+		return "%r7";
+	case PERF_REG_LOONGARCH_R8:
+		return "%r8";
+	case PERF_REG_LOONGARCH_R9:
+		return "%r9";
+	case PERF_REG_LOONGARCH_R10:
+		return "%r10";
+	case PERF_REG_LOONGARCH_R11:
+		return "%r11";
+	case PERF_REG_LOONGARCH_R12:
+		return "%r12";
+	case PERF_REG_LOONGARCH_R13:
+		return "%r13";
+	case PERF_REG_LOONGARCH_R14:
+		return "%r14";
+	case PERF_REG_LOONGARCH_R15:
+		return "%r15";
+	case PERF_REG_LOONGARCH_R16:
+		return "%r16";
+	case PERF_REG_LOONGARCH_R17:
+		return "%r17";
+	case PERF_REG_LOONGARCH_R18:
+		return "%r18";
+	case PERF_REG_LOONGARCH_R19:
+		return "%r19";
+	case PERF_REG_LOONGARCH_R20:
+		return "%r20";
+	case PERF_REG_LOONGARCH_R21:
+		return "%r21";
+	case PERF_REG_LOONGARCH_R22:
+		return "%r22";
+	case PERF_REG_LOONGARCH_R23:
+		return "%r23";
+	case PERF_REG_LOONGARCH_R24:
+		return "%r24";
+	case PERF_REG_LOONGARCH_R25:
+		return "%r25";
+	case PERF_REG_LOONGARCH_R26:
+		return "%r26";
+	case PERF_REG_LOONGARCH_R27:
+		return "%r27";
+	case PERF_REG_LOONGARCH_R28:
+		return "%r28";
+	case PERF_REG_LOONGARCH_R29:
+		return "%r29";
+	case PERF_REG_LOONGARCH_R30:
+		return "%r30";
+	case PERF_REG_LOONGARCH_R31:
+		return "%r31";
+	default:
+		break;
+	}
+	return NULL;
+}
+
 static const char *__perf_reg_name_mips(int id)
 {
 	switch (id) {
@@ -670,6 +744,8 @@ const char *perf_reg_name(int id, const char *arch)
 
 	if (!strcmp(arch, "csky"))
 		reg_name = __perf_reg_name_csky(id);
+	else if (!strcmp(arch, "loongarch"))
+		reg_name = __perf_reg_name_loongarch(id);
 	else if (!strcmp(arch, "mips"))
 		reg_name = __perf_reg_name_mips(id);
 	else if (!strcmp(arch, "powerpc"))
diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c
index a2e906858891..313eccef6cb4 100644
--- a/tools/perf/util/syscalltbl.c
+++ b/tools/perf/util/syscalltbl.c
@@ -38,6 +38,10 @@ static const char **syscalltbl_native = syscalltbl_arm64;
 #include <asm/syscalls_n64.c>
 const int syscalltbl_native_max_id = SYSCALLTBL_MIPS_N64_MAX_ID;
 static const char **syscalltbl_native = syscalltbl_mips_n64;
+#elif defined(__loongarch__)
+#include <asm/syscalls.c>
+const int syscalltbl_native_max_id = SYSCALLTBL_LOONGARCH_MAX_ID;
+static const char **syscalltbl_native = syscalltbl_loongarch;
 #endif
 
 struct syscall {
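
An illustrative aside for readers following the perf_regs.c hunks above: the register-name plumbing is a two-level dispatch, a per-arch id-to-name helper plus an arch-string dispatcher, and its shape is easy to try out in isolation. The sketch below is not part of the series; the DEMO_* enum values are hypothetical stand-ins for the uapi PERF_REG_LOONGARCH_* constants, and the "unknown" fallback is an assumption about how perf_reg_name() handles a NULL result, not something shown in the hunks themselves.

/* Stand-alone sketch of the two-level reg-name dispatch; illustrative only. */
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the uapi PERF_REG_LOONGARCH_* values. */
enum demo_loongarch_regs {
	DEMO_REG_PC,
	DEMO_REG_R1,	/* ra */
	DEMO_REG_R2,	/* tp */
	DEMO_REG_R3,	/* sp */
};

/* Same shape as __perf_reg_name_loongarch(): id in, printable name out. */
static const char *demo_reg_name_loongarch(int id)
{
	switch (id) {
	case DEMO_REG_PC:
		return "PC";
	case DEMO_REG_R1:
		return "%r1";
	case DEMO_REG_R2:
		return "%r2";
	case DEMO_REG_R3:
		return "%r3";
	default:
		break;
	}
	return NULL;
}

/* Same shape as perf_reg_name(): the arch string picks the helper. */
static const char *demo_perf_reg_name(int id, const char *arch)
{
	const char *reg_name = NULL;

	if (!strcmp(arch, "loongarch"))
		reg_name = demo_reg_name_loongarch(id);

	return reg_name ? reg_name : "unknown";
}

int main(void)
{
	printf("%s\n", demo_perf_reg_name(DEMO_REG_R3, "loongarch"));	/* %r3 */
	printf("%s\n", demo_perf_reg_name(99, "loongarch"));		/* unknown */
	return 0;
}

The design point worth noticing is that the per-arch helpers return NULL for ids they do not know and the dispatcher owns the fallback, so wiring up a new architecture, as this patch does, never has to touch the error path.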