From 30a1b5ef0c953a7ba82169c46f92f52fa11ee15e Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 8 Feb 2013 23:02:33 +0100 Subject: [PATCH 01/15] ARM: 7642/1: netx: bump IRQ offset to 64 The Netx IRQs are offset from zero, which is illegal, since Linux IRQ 0 is NO_IRQ. Acked-by: Sascha Hauer Reviewed-by: Jamie Iles Signed-off-by: Linus Walleij Signed-off-by: Russell King --- arch/arm/mach-netx/generic.c | 2 +- arch/arm/mach-netx/include/mach/irqs.h | 64 +++++++++++++------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/arm/mach-netx/generic.c b/arch/arm/mach-netx/generic.c index 27c2cb7ab813..1504b68f4c66 100644 --- a/arch/arm/mach-netx/generic.c +++ b/arch/arm/mach-netx/generic.c @@ -168,7 +168,7 @@ void __init netx_init_irq(void) { int irq; - vic_init(io_p2v(NETX_PA_VIC), 0, ~0, 0); + vic_init(io_p2v(NETX_PA_VIC), NETX_IRQ_VIC_START, ~0, 0); for (irq = NETX_IRQ_HIF_CHAINED(0); irq <= NETX_IRQ_HIF_LAST; irq++) { irq_set_chip_and_handler(irq, &netx_hif_chip, diff --git a/arch/arm/mach-netx/include/mach/irqs.h b/arch/arm/mach-netx/include/mach/irqs.h index 6ce914d54a30..8f74a844a775 100644 --- a/arch/arm/mach-netx/include/mach/irqs.h +++ b/arch/arm/mach-netx/include/mach/irqs.h @@ -17,42 +17,42 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#define NETX_IRQ_VIC_START 0 -#define NETX_IRQ_SOFTINT 0 -#define NETX_IRQ_TIMER0 1 -#define NETX_IRQ_TIMER1 2 -#define NETX_IRQ_TIMER2 3 -#define NETX_IRQ_SYSTIME_NS 4 -#define NETX_IRQ_SYSTIME_S 5 -#define NETX_IRQ_GPIO_15 6 -#define NETX_IRQ_WATCHDOG 7 -#define NETX_IRQ_UART0 8 -#define NETX_IRQ_UART1 9 -#define NETX_IRQ_UART2 10 -#define NETX_IRQ_USB 11 -#define NETX_IRQ_SPI 12 -#define NETX_IRQ_I2C 13 -#define NETX_IRQ_LCD 14 -#define NETX_IRQ_HIF 15 -#define NETX_IRQ_GPIO_0_14 16 -#define NETX_IRQ_XPEC0 17 -#define NETX_IRQ_XPEC1 18 -#define NETX_IRQ_XPEC2 19 -#define NETX_IRQ_XPEC3 20 -#define NETX_IRQ_XPEC(no) (17 + (no)) -#define NETX_IRQ_MSYNC0 21 -#define NETX_IRQ_MSYNC1 22 -#define NETX_IRQ_MSYNC2 23 -#define NETX_IRQ_MSYNC3 24 -#define NETX_IRQ_IRQ_PHY 25 -#define NETX_IRQ_ISO_AREA 26 +#define NETX_IRQ_VIC_START 64 +#define NETX_IRQ_SOFTINT (NETX_IRQ_VIC_START + 0) +#define NETX_IRQ_TIMER0 (NETX_IRQ_VIC_START + 1) +#define NETX_IRQ_TIMER1 (NETX_IRQ_VIC_START + 2) +#define NETX_IRQ_TIMER2 (NETX_IRQ_VIC_START + 3) +#define NETX_IRQ_SYSTIME_NS (NETX_IRQ_VIC_START + 4) +#define NETX_IRQ_SYSTIME_S (NETX_IRQ_VIC_START + 5) +#define NETX_IRQ_GPIO_15 (NETX_IRQ_VIC_START + 6) +#define NETX_IRQ_WATCHDOG (NETX_IRQ_VIC_START + 7) +#define NETX_IRQ_UART0 (NETX_IRQ_VIC_START + 8) +#define NETX_IRQ_UART1 (NETX_IRQ_VIC_START + 9) +#define NETX_IRQ_UART2 (NETX_IRQ_VIC_START + 10) +#define NETX_IRQ_USB (NETX_IRQ_VIC_START + 11) +#define NETX_IRQ_SPI (NETX_IRQ_VIC_START + 12) +#define NETX_IRQ_I2C (NETX_IRQ_VIC_START + 13) +#define NETX_IRQ_LCD (NETX_IRQ_VIC_START + 14) +#define NETX_IRQ_HIF (NETX_IRQ_VIC_START + 15) +#define NETX_IRQ_GPIO_0_14 (NETX_IRQ_VIC_START + 16) +#define NETX_IRQ_XPEC0 (NETX_IRQ_VIC_START + 17) +#define NETX_IRQ_XPEC1 (NETX_IRQ_VIC_START + 18) +#define NETX_IRQ_XPEC2 (NETX_IRQ_VIC_START + 19) +#define NETX_IRQ_XPEC3 (NETX_IRQ_VIC_START + 20) +#define NETX_IRQ_XPEC(no) (NETX_IRQ_VIC_START + 17 + (no)) +#define NETX_IRQ_MSYNC0 (NETX_IRQ_VIC_START + 21) +#define NETX_IRQ_MSYNC1 (NETX_IRQ_VIC_START + 22) +#define NETX_IRQ_MSYNC2 (NETX_IRQ_VIC_START + 23) +#define NETX_IRQ_MSYNC3 (NETX_IRQ_VIC_START + 24) +#define NETX_IRQ_IRQ_PHY (NETX_IRQ_VIC_START + 25)
+#define NETX_IRQ_ISO_AREA (NETX_IRQ_VIC_START + 26) /* int 27 is reserved */ /* int 28 is reserved */ -#define NETX_IRQ_TIMER3 29 -#define NETX_IRQ_TIMER4 30 +#define NETX_IRQ_TIMER3 (NETX_IRQ_VIC_START + 29) +#define NETX_IRQ_TIMER4 (NETX_IRQ_VIC_START + 30) /* int 31 is reserved */ -#define NETX_IRQS 32 +#define NETX_IRQS (NETX_IRQ_VIC_START + 32) /* for multiplexed irqs on gpio 0..14 */ #define NETX_IRQ_GPIO(x) (NETX_IRQS + (x)) From 78305c863074f809f09a96ea97b86d0f0c1c8399 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 15 Feb 2013 13:30:58 +0100 Subject: [PATCH 02/15] ARM: 7652/1: mm: fix missing use of 'asid' to get asid value from mm->context.id Fix missing use of the asid macro when getting the ASID from the mm->context.id field. Signed-off-by: Ben Dooks Signed-off-by: Russell King --- arch/arm/mm/proc-v7-3level.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/proc-v7-3level.S b/arch/arm/mm/proc-v7-3level.S index 50bf1dafc9ea..6ffd78c0f9ab 100644 --- a/arch/arm/mm/proc-v7-3level.S +++ b/arch/arm/mm/proc-v7-3level.S @@ -48,7 +48,7 @@ ENTRY(cpu_v7_switch_mm) #ifdef CONFIG_MMU mmid r1, r1 @ get mm->context.id - and r3, r1, #0xff + asid r3, r1 mov r3, r3, lsl #(48 - 32) @ ASID mcrr p15, 0, r0, r3, c2 @ set TTB 0 isb From 904464b91eca8c665acea033489225af02eeb75a Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Fri, 22 Feb 2013 14:01:42 +0100 Subject: [PATCH 03/15] ARM: 7655/1: smp_twd: make twd_local_timer_of_register() no-op for nosmp When booting an SMP build kernel with nosmp on the kernel cmdline, the following warning will be hit. ------------[ cut here ]------------ WARNING: at arch/arm/kernel/smp_twd.c:345 twd_local_timer_of_register+0x7c/0x90() twd_local_timer_of_register failed (-6) Modules linked in: Backtrace: [<80011f14>] (dump_backtrace+0x0/0x10c) from [<8044dd30>] (dump_stack+0x18/0x1c) r7:805e9f58 r6:805ba84c r5:80539331 r4:00000159 [<8044dd18>] (dump_stack+0x0/0x1c) from [<80020fbc>] (warn_slowpath_common+0x54/0x6c) [<80020f68>] (warn_slowpath_common+0x0/0x6c) from [<80021078>] (warn_slowpath_fmt+0x38/0x40) r9:412fc09a r8:8fffffff r7:ffffffff r6:00000001 r5:80633b8c r4:80b32da8 [<80021040>] (warn_slowpath_fmt+0x0/0x40) from [<805ba84c>] (twd_local_timer_of_register+0x7c/0x90) r3:fffffffa r2:8053934b [<805ba7d0>] (twd_local_timer_of_register+0x0/0x90) from [<805c0bec>] (imx6q_timer_init+0x18/0x4c) r5:80633800 r4:8053b701 [<805c0bd4>] (imx6q_timer_init+0x0/0x4c) from [<805ba4e8>] (time_init+0x28/0x38) r5:80633800 r4:805dc0f4 [<805ba4c0>] (time_init+0x0/0x38) from [<805b6854>] (start_kernel+0x1a0/0x310) [<805b66b4>] (start_kernel+0x0/0x310) from [<10008044>] (0x10008044) r8:1000406a r7:805f3f8c r6:805dc0c4 r5:805f0518 r4:10c5387d ---[ end trace 1b75b31a2719ed1c ]--- Check (!is_smp() || !setup_max_cpus) in twd_local_timer_of_register() to make it a no-op under these conditions, thus avoiding the warning above.
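For reference, 'nosmp' zeroes setup_max_cpus through an early_param in kernel/smp.c — roughly the following sketch; the exact body may differ between kernel versions:

    static int __init nosmp(char *str)
    {
            setup_max_cpus = 0;
            arch_disable_smp_support();
            return 0;
    }
    early_param("nosmp", nosmp);

So !setup_max_cpus covers 'nosmp' (and 'maxcpus=0') boots, while !is_smp() covers kernels running on hardware that reports itself as uniprocessor.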
Reported-by: Dirk Behme Signed-off-by: Shawn Guo Signed-off-by: Russell King --- arch/arm/kernel/smp_twd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c index c092115d903a..3f2565037480 100644 --- a/arch/arm/kernel/smp_twd.c +++ b/arch/arm/kernel/smp_twd.c @@ -22,6 +22,7 @@ #include <linux/of_irq.h> #include <linux/of_address.h> +#include <asm/smp_plat.h> #include <asm/smp_twd.h> #include <asm/localtimer.h> @@ -373,6 +374,9 @@ void __init twd_local_timer_of_register(void) struct device_node *np; int err; + if (!is_smp() || !setup_max_cpus) + return; + np = of_find_matching_node(NULL, twd_of_match); if (!np) return; From d61947a164760ac520cb416768afdf38c33d60e7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Feb 2013 17:46:16 +0100 Subject: [PATCH 04/15] ARM: 7657/1: head: fix swapper and idmap population with LPAE and big-endian The LPAE page table format uses 64-bit descriptors, so we need to take endianness into account when populating the swapper and idmap tables during early initialisation. This patch ensures that we store the two words making up each page table entry in the correct order when running big-endian. Cc: Acked-by: Catalin Marinas Tested-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/kernel/head.S | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 486a15ae9011..e0eb9a1cae77 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -184,13 +184,22 @@ __create_page_tables: orr r3, r3, #3 @ PGD block type mov r6, #4 @ PTRS_PER_PGD mov r7, #1 << (55 - 32) @ L_PGD_SWAPPER -1: str r3, [r0], #4 @ set bottom PGD entry bits +1: +#ifdef CONFIG_CPU_ENDIAN_BE8 str r7, [r0], #4 @ set top PGD entry bits + str r3, [r0], #4 @ set bottom PGD entry bits +#else + str r3, [r0], #4 @ set bottom PGD entry bits + str r7, [r0], #4 @ set top PGD entry bits +#endif add r3, r3, #0x1000 @ next PMD table subs r6, r6, #1 bne 1b add r4, r4, #0x1000 @ point to the PMD tables +#ifdef CONFIG_CPU_ENDIAN_BE8 + add r4, r4, #4 @ we only write the bottom word +#endif #endif ldr r7, [r10, #PROCINFO_MM_MMUFLAGS] @ mm_mmuflags @@ -258,6 +267,11 @@ __create_page_tables: addne r6, r6, #1 << SECTION_SHIFT strne r6, [r3] +#if defined(CONFIG_ARM_LPAE) && defined(CONFIG_CPU_ENDIAN_BE8) + sub r4, r4, #4 @ Fixup page table pointer + @ for 64-bit descriptors +#endif + #ifdef CONFIG_DEBUG_LL #if !defined(CONFIG_DEBUG_ICEDCC) && !defined(CONFIG_DEBUG_SEMIHOSTING) /* @@ -276,12 +290,16 @@ __create_page_tables: orr r3, r7, r3, lsl #SECTION_SHIFT #ifdef CONFIG_ARM_LPAE mov r7, #1 << (54 - 32) @ XN +#ifdef CONFIG_CPU_ENDIAN_BE8 + str r7, [r0], #4 + str r3, [r0], #4 +#else + str r3, [r0], #4 + str r7, [r0], #4 +#endif #else orr r3, r3, #PMD_SECT_XN -#endif str r3, [r0], #4 -#ifdef CONFIG_ARM_LPAE - str r7, [r0], #4 #endif #else /* CONFIG_DEBUG_ICEDCC || CONFIG_DEBUG_SEMIHOSTING */ From 37f47e3d62533c931b04cb409f2eb299e6342331 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Feb 2013 17:47:20 +0100 Subject: [PATCH 05/15] ARM: 7658/1: mm: fix race updating mm->context.id on ASID rollover If a thread triggers an ASID rollover, other threads of the same process must be made to wait until the mm->context.id for the shared mm_struct has been updated to the new generation and associated book-keeping (e.g. TLB invalidation) has been performed.
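Concretely, the waiting happens at the fast-path check in check_and_switch_context() — sketched here from the pre-patch code visible in the context.c diffs below — where a thread may skip the cpu_asid_lock slow path only if its mm's generation is current and it can atomically publish a non-zero ASID for this CPU:

    if (!((mm->context.id ^ atomic64_read(&asid_generation)) >> ASID_BITS)
        && atomic64_xchg(&per_cpu(active_asids, cpu), mm->context.id))
            goto switch_mm_fastpath;

Every other thread of the process falls back to taking the lock.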
However, there is a *tiny* window where both mm->context.id and the relevant active_asids entry are updated to the new generation, but the TLB flush has not been performed, which could allow another thread to return to userspace with a dirty TLB, potentially leading to data corruption. In reality this will never occur because one CPU would need to perform a context-switch in the time it takes another to do a couple of atomic test/set operations, but we should plug the race anyway. This patch delays the active_asids update until after the potential TLB flush on context-switch. Cc: # 3.8 Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/mm/context.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c index 7a0511191f6b..03ba181e359c 100644 --- a/arch/arm/mm/context.c +++ b/arch/arm/mm/context.c @@ -207,11 +207,11 @@ void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk) if ((mm->context.id ^ atomic64_read(&asid_generation)) >> ASID_BITS) new_context(mm, cpu); - atomic64_set(&per_cpu(active_asids, cpu), mm->context.id); - cpumask_set_cpu(cpu, mm_cpumask(mm)); - if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) local_flush_tlb_all(); + + atomic64_set(&per_cpu(active_asids, cpu), mm->context.id); + cpumask_set_cpu(cpu, mm_cpumask(mm)); raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: From 8a4e3a9ead7e37ce1505602b564c15da09ac039f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Feb 2013 17:47:36 +0100 Subject: [PATCH 06/15] ARM: 7659/1: mm: make mm->context.id an atomic64_t variable mm->context.id is updated under asid_lock when a new ASID is allocated to an mm_struct. However, it is also read without the lock when a task is being scheduled and checking whether or not the current ASID generation is up-to-date. If two threads of the same process are being scheduled in parallel and the bottom bits of the generation in their mm->context.id match the current generation (that is, the mm_struct has not been used for ~2^24 rollovers) then the non-atomic, lockless access to mm->context.id may yield the incorrect ASID. This patch fixes this issue by making mm->context.id an atomic64_t, ensuring that the generation is always read consistently. For code that only requires access to the ASID bits (e.g. TLB flushing by mm), the value is accessed directly, which GCC converts to an ldrb. Cc: # 3.8 Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/include/asm/mmu.h | 8 ++++---- arch/arm/include/asm/mmu_context.h | 2 +- arch/arm/kernel/asm-offsets.c | 2 +- arch/arm/mm/context.c | 21 +++++++++++++-------- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 9f77e7804f3b..e3d55547e755 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h @@ -5,15 +5,15 @@ typedef struct { #ifdef CONFIG_CPU_HAS_ASID - u64 id; + atomic64_t id; #endif - unsigned int vmalloc_seq; + unsigned int vmalloc_seq; } mm_context_t; #ifdef CONFIG_CPU_HAS_ASID #define ASID_BITS 8 #define ASID_MASK ((~0ULL) << ASID_BITS) -#define ASID(mm) ((mm)->context.id & ~ASID_MASK) +#define ASID(mm) ((mm)->context.id.counter & ~ASID_MASK) #else #define ASID(mm) (0) #endif @@ -26,7 +26,7 @@ typedef struct { * modified for 2.6 by Hyok S. 
Choi */ typedef struct { - unsigned long end_brk; + unsigned long end_brk; } mm_context_t; #endif diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h index e1f644bc7cc5..863a6611323c 100644 --- a/arch/arm/include/asm/mmu_context.h +++ b/arch/arm/include/asm/mmu_context.h @@ -25,7 +25,7 @@ void __check_vmalloc_seq(struct mm_struct *mm); #ifdef CONFIG_CPU_HAS_ASID void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk); -#define init_new_context(tsk,mm) ({ mm->context.id = 0; }) +#define init_new_context(tsk,mm) ({ atomic64_set(&mm->context.id, 0); 0; }) #else /* !CONFIG_CPU_HAS_ASID */ diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 5ce738b43508..923eec7105cf 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -110,7 +110,7 @@ int main(void) BLANK(); #endif #ifdef CONFIG_CPU_HAS_ASID - DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id)); + DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); BLANK(); #endif DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c index 03ba181e359c..44d4ee52f3e2 100644 --- a/arch/arm/mm/context.c +++ b/arch/arm/mm/context.c @@ -152,9 +152,9 @@ static int is_reserved_asid(u64 asid) return 0; } -static void new_context(struct mm_struct *mm, unsigned int cpu) +static u64 new_context(struct mm_struct *mm, unsigned int cpu) { - u64 asid = mm->context.id; + u64 asid = atomic64_read(&mm->context.id); u64 generation = atomic64_read(&asid_generation); if (asid != 0 && is_reserved_asid(asid)) { @@ -181,13 +181,14 @@ static void new_context(struct mm_struct *mm, unsigned int cpu) cpumask_clear(mm_cpumask(mm)); } - mm->context.id = asid; + return asid; } void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk) { unsigned long flags; unsigned int cpu = smp_processor_id(); + u64 asid; if (unlikely(mm->context.vmalloc_seq != init_mm.context.vmalloc_seq)) __check_vmalloc_seq(mm); @@ -198,19 +199,23 @@ void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk) */ cpu_set_reserved_ttbr0(); - if (!((mm->context.id ^ atomic64_read(&asid_generation)) >> ASID_BITS) - && atomic64_xchg(&per_cpu(active_asids, cpu), mm->context.id)) + asid = atomic64_read(&mm->context.id); + if (!((asid ^ atomic64_read(&asid_generation)) >> ASID_BITS) + && atomic64_xchg(&per_cpu(active_asids, cpu), asid)) goto switch_mm_fastpath; raw_spin_lock_irqsave(&cpu_asid_lock, flags); /* Check that our ASID belongs to the current generation. */ - if ((mm->context.id ^ atomic64_read(&asid_generation)) >> ASID_BITS) - new_context(mm, cpu); + asid = atomic64_read(&mm->context.id); + if ((asid ^ atomic64_read(&asid_generation)) >> ASID_BITS) { + asid = new_context(mm, cpu); + atomic64_set(&mm->context.id, asid); + } if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) local_flush_tlb_all(); - atomic64_set(&per_cpu(active_asids, cpu), mm->context.id); + atomic64_set(&per_cpu(active_asids, cpu), asid); cpumask_set_cpu(cpu, mm_cpumask(mm)); raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); From 862c588f062fe9339a180cf6429e4df1855c376a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Feb 2013 17:48:11 +0100 Subject: [PATCH 07/15] ARM: 7660/1: tlb: add branch predictor maintenance operations The ARM architecture requires explicit branch predictor maintenance when updating an instruction stream for a given virtual address. 
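For reference, on ARMv6/v7 that maintenance is done with the CP15 branch-predictor invalidate operations — shown here as a sketch matching the encodings used in the diff below, where Rt is ignored and conventionally zero:

    mcr p15, 0, Rt, c7, c5, 6   @ BPIALL: invalidate entire branch predictor array
    mcr p15, 0, Rt, c7, c1, 6   @ BPIALLIS: same, Inner Shareable domain
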
In reality, this isn't so much of a burden because the branch predictor is flushed during the cache maintenance required to make the new instructions visible to the I-side of the processor. However, there are still some cases where explicit flushing is required, so add a local_flush_bp_all operation to deal with this. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/include/asm/tlbflush.h | 34 +++++++++++++++++++++++++++------ arch/arm/kernel/smp_tlb.c | 12 ++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h index 6e924d3a77eb..4db8c8820f0d 100644 --- a/arch/arm/include/asm/tlbflush.h +++ b/arch/arm/include/asm/tlbflush.h @@ -34,10 +34,13 @@ #define TLB_V6_D_ASID (1 << 17) #define TLB_V6_I_ASID (1 << 18) +#define TLB_V6_BP (1 << 19) + /* Unified Inner Shareable TLB operations (ARMv7 MP extensions) */ -#define TLB_V7_UIS_PAGE (1 << 19) -#define TLB_V7_UIS_FULL (1 << 20) -#define TLB_V7_UIS_ASID (1 << 21) +#define TLB_V7_UIS_PAGE (1 << 20) +#define TLB_V7_UIS_FULL (1 << 21) +#define TLB_V7_UIS_ASID (1 << 22) +#define TLB_V7_UIS_BP (1 << 23) #define TLB_BARRIER (1 << 28) #define TLB_L2CLEAN_FR (1 << 29) /* Feroceon */ @@ -150,7 +153,8 @@ #define v6wbi_tlb_flags (TLB_WB | TLB_DCLEAN | TLB_BARRIER | \ TLB_V6_I_FULL | TLB_V6_D_FULL | \ TLB_V6_I_PAGE | TLB_V6_D_PAGE | \ - TLB_V6_I_ASID | TLB_V6_D_ASID) + TLB_V6_I_ASID | TLB_V6_D_ASID | \ + TLB_V6_BP) #ifdef CONFIG_CPU_TLB_V6 # define v6wbi_possible_flags v6wbi_tlb_flags @@ -166,9 +170,11 @@ #endif #define v7wbi_tlb_flags_smp (TLB_WB | TLB_DCLEAN | TLB_BARRIER | \ - TLB_V7_UIS_FULL | TLB_V7_UIS_PAGE | TLB_V7_UIS_ASID) + TLB_V7_UIS_FULL | TLB_V7_UIS_PAGE | \ + TLB_V7_UIS_ASID | TLB_V7_UIS_BP) #define v7wbi_tlb_flags_up (TLB_WB | TLB_DCLEAN | TLB_BARRIER | \ - TLB_V6_U_FULL | TLB_V6_U_PAGE | TLB_V6_U_ASID) + TLB_V6_U_FULL | TLB_V6_U_PAGE | \ + TLB_V6_U_ASID | TLB_V6_BP) #ifdef CONFIG_CPU_TLB_V7 @@ -430,6 +436,20 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr) } } +static inline void local_flush_bp_all(void) +{ + const int zero = 0; + const unsigned int __tlb_flag = __cpu_tlb_flags; + + if (tlb_flag(TLB_V7_UIS_BP)) + asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero)); + else if (tlb_flag(TLB_V6_BP)) + asm("mcr p15, 0, %0, c7, c5, 6" : : "r" (zero)); + + if (tlb_flag(TLB_BARRIER)) + isb(); +} + /* * flush_pmd_entry * @@ -480,6 +500,7 @@ static inline void clean_pmd_entry(void *pmd) #define flush_tlb_kernel_page local_flush_tlb_kernel_page #define flush_tlb_range local_flush_tlb_range #define flush_tlb_kernel_range local_flush_tlb_kernel_range +#define flush_bp_all local_flush_bp_all #else extern void flush_tlb_all(void); extern void flush_tlb_mm(struct mm_struct *mm); @@ -487,6 +508,7 @@ extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr); extern void flush_tlb_kernel_page(unsigned long kaddr); extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); +extern void flush_bp_all(void); #endif /* diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c index 02c5d2ce23bf..bd0300531399 100644 --- a/arch/arm/kernel/smp_tlb.c +++ b/arch/arm/kernel/smp_tlb.c @@ -64,6 +64,11 @@ static inline void ipi_flush_tlb_kernel_range(void *arg) local_flush_tlb_kernel_range(ta->ta_start, ta->ta_end); } +static inline void ipi_flush_bp_all(void *ignored) +{ +
local_flush_bp_all(); +} + void flush_tlb_all(void) { if (tlb_ops_need_broadcast()) @@ -127,3 +132,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) local_flush_tlb_kernel_range(start, end); } +void flush_bp_all(void) +{ + if (tlb_ops_need_broadcast()) + on_each_cpu(ipi_flush_bp_all, NULL, 1); + else + local_flush_bp_all(); +} From 89c7e4b8bbb3d4fa52df5746a8ad38e610143651 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Feb 2013 17:48:40 +0100 Subject: [PATCH 08/15] ARM: 7661/1: mm: perform explicit branch predictor maintenance when required The ARM ARM requires branch predictor maintenance if, for a given ASID, the instructions at a specific virtual address appear to change. From the kernel's point of view, that means: - Changing the kernel's view of memory (e.g. switching to the identity map) - ASID rollover (since ASIDs will be re-allocated to new tasks) This patch adds explicit branch predictor maintenance when either of the two conditions above is met. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/kernel/smp.c | 1 + arch/arm/kernel/suspend.c | 1 + arch/arm/mm/context.c | 4 +++- arch/arm/mm/idmap.c | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 1bdfd87c8e41..31644f1978d5 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -285,6 +285,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void) * switch away from it before attempting any exclusive accesses. */ cpu_switch_mm(mm->pgd, mm); + local_flush_bp_all(); enter_lazy_tlb(mm, current); local_flush_tlb_all(); diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c index 358bca3a995e..c59c97ea8268 100644 --- a/arch/arm/kernel/suspend.c +++ b/arch/arm/kernel/suspend.c @@ -68,6 +68,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) ret = __cpu_suspend(arg, fn); if (ret == 0) { cpu_switch_mm(mm->pgd, mm); + local_flush_bp_all(); local_flush_tlb_all(); } diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c index 44d4ee52f3e2..a5a4b2bc42ba 100644 --- a/arch/arm/mm/context.c +++ b/arch/arm/mm/context.c @@ -212,8 +212,10 @@ void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk) atomic64_set(&mm->context.id, asid); } - if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) + if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) { + local_flush_bp_all(); local_flush_tlb_all(); + } atomic64_set(&per_cpu(active_asids, cpu), asid); cpumask_set_cpu(cpu, mm_cpumask(mm)); diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index 2dffc010cc41..5ee505c937d1 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -141,6 +141,7 @@ void setup_mm_for_reboot(void) { /* Switch to the identity mapping. */ cpu_switch_mm(idmap_pgd, &init_mm); + local_flush_bp_all(); #ifdef CONFIG_CPU_HAS_ASID /* From 1a8e611874da714ee7ef1e92e5160b38dc54959b Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 28 Feb 2013 17:48:57 +0100 Subject: [PATCH 09/15] ARM: 7662/1: hw_breakpoint: reset debug logic on secondary CPUs in s2ram resume We must mask out the CPU_TASKS_FROZEN bit so that reset_ctrl_regs is also called on a secondary CPU during s2ram resume, where only the boot CPU will receive the PM_EXIT notification. 
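The masking works because the _FROZEN hotplug notifier actions are simply the plain actions with one extra bit OR'd in — a sketch based on the constants in <linux/cpu.h> of this era:

    /* from <linux/cpu.h>: CPU_TASKS_FROZEN is 0x0010, and e.g.
     * CPU_ONLINE_FROZEN == (CPU_ONLINE | CPU_TASKS_FROZEN) */
    if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE)
            smp_call_function_single((int)cpu, reset_ctrl_regs, NULL, 1);

so the check matches both CPU_ONLINE and CPU_ONLINE_FROZEN, the latter being what secondary CPUs generate when brought back during resume.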
Signed-off-by: Dietmar Eggemann Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 5eae53e7a2e1..96093b75ab90 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -1023,7 +1023,7 @@ out_mdbgen: static int __cpuinit dbg_reset_notify(struct notifier_block *self, unsigned long action, void *cpu) { - if (action == CPU_ONLINE) + if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE) smp_call_function_single((int)cpu, reset_ctrl_regs, NULL, 1); return NOTIFY_OK; From f2fe09b055e2549de41fb107b34c60bac4a1b0cf Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Feb 2013 17:49:11 +0100 Subject: [PATCH 10/15] ARM: 7663/1: perf: fix ARMv7 EVTYPE_MASK to include NSH bit A masked-out PMXEVTYPER.NSH bit means that we can't enable profiling at PL2, regardless of the settings in the HDCR. This patch fixes the broken mask. Cc: Reported-by: Christoffer Dall Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/kernel/perf_event_v7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 8c79a9e70b83..039cffb053a7 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -774,7 +774,7 @@ static const unsigned armv7_a7_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] /* * PMXEVTYPER: Event selection reg */ -#define ARMV7_EVTYPE_MASK 0xc00000ff /* Mask for writable bits */ +#define ARMV7_EVTYPE_MASK 0xc80000ff /* Mask for writable bits */ #define ARMV7_EVTYPE_EVENT 0xff /* Mask for EVENT bits */ /* From e595ede6050b1ce982d74f7084f93715bcc32359 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 28 Feb 2013 17:51:29 +0100 Subject: [PATCH 11/15] ARM: 7664/1: perf: remove erroneous semicolon from event initialisation Commit 9dcbf466559f ("ARM: perf: simplify __hw_perf_event_init err handling") tidied up the error handling code for perf event initialisation on ARM, but a copy-and-paste error left a dangling semicolon at the end of an if statement. This patch removes the broken semicolon, restoring the old group validation semantics. Cc: Mark Rutland Acked-by: Dirk Behme Signed-off-by: Chen Gang Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 31e0eb353cd8..a89206799db8 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -400,7 +400,7 @@ __hw_perf_event_init(struct perf_event *event) } if (event->group_leader != event) { - if (validate_group(event) != 0); + if (validate_group(event) != 0) return -EINVAL; } From 3f7d1fe108dbaefd0c57a41753fc2c90b395f458 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 28 Feb 2013 21:53:35 +0100 Subject: [PATCH 12/15] ARM: 7665/1: Wire up kcmp syscall Wire up the kcmp syscall so that the checkpoint/restore procedure can proceed on the ARM platform. 
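For illustration, userspace typically has no libc wrapper for kcmp and invokes it through syscall(2); a sketch, with KCMP_VM coming from <linux/kcmp.h>:

    #include <linux/kcmp.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* 0 => pid1 and pid2 share one mm_struct; >0 => distinct; <0 => error */
    static int same_vm(pid_t pid1, pid_t pid2)
    {
            return syscall(__NR_kcmp, pid1, pid2, KCMP_VM, 0, 0);
    }

Comparisons like this are how checkpoint/restore tools detect resources shared between tasks.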
Signed-off-by: Alexander Kartashov Signed-off-by: Cyrill Gorcunov Acked-by: Arnd Bergmann Signed-off-by: Russell King --- arch/arm/include/uapi/asm/unistd.h | 2 +- arch/arm/kernel/calls.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h index 4da7cde70b5d..af33b44990ed 100644 --- a/arch/arm/include/uapi/asm/unistd.h +++ b/arch/arm/include/uapi/asm/unistd.h @@ -404,7 +404,7 @@ #define __NR_setns (__NR_SYSCALL_BASE+375) #define __NR_process_vm_readv (__NR_SYSCALL_BASE+376) #define __NR_process_vm_writev (__NR_SYSCALL_BASE+377) - /* 378 for kcmp */ +#define __NR_kcmp (__NR_SYSCALL_BASE+378) #define __NR_finit_module (__NR_SYSCALL_BASE+379) /* diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 0cc57611fc4f..c6ca7e376773 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -387,7 +387,7 @@ /* 375 */ CALL(sys_setns) CALL(sys_process_vm_readv) CALL(sys_process_vm_writev) - CALL(sys_ni_syscall) /* reserved for sys_kcmp */ + CALL(sys_kcmp) CALL(sys_finit_module) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls From b8083f86e8e9b23ad4d75d334c92b601947dfa91 Mon Sep 17 00:00:00 2001 From: Jonathan Austin Date: Mon, 4 Mar 2013 15:17:36 +0100 Subject: [PATCH 13/15] ARM: 7666/1: decompressor: add -mno-single-pic-base for building the decompressor Before jumping to (position independent) C-code from the decompressor's assembler world we set-up the C environment. This setup currently does not set r9, which for arm-none-uclinux-uclibceabi toolchains is by default expected to be the PIC offset base register (IE should point to the beginning of the GOT). Currently, therefore, in order to build working kernels that use the decompressor it is necessary to use an arm-linux-gnueabi toolchain, or similar. uClinux toolchains cause a prefetch abort to occur at the beginning of the decompress_kernel function. This patch allows uClinux toolchains to build bootable zImages by forcing the -mno-single-pic-base option, which ensures that the location of the GOT is re-derived each time it is required, and r9 becomes free for use as a general purpose register. This has a small (4% in instruction terms) advantage over the alternative of setting r9 to point to the GOT before calling into the C-world. Signed-off-by: Jonathan Austin Acked-by: Nicolas Pitre Signed-off-by: Russell King --- arch/arm/boot/compressed/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 5cad8a6dadb0..afed28e37ea5 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -120,7 +120,7 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) endif -ccflags-y := -fpic -fno-builtin -I$(obj) +ccflags-y := -fpic -mno-single-pic-base -fno-builtin -I$(obj) asflags-y := -Wa,-march=all -DZIMAGE # Supply kernel BSS size to the decompressor via a linker symbol. From 44d6b1fc3e3c6a3af8e599b724972e881c81e1c9 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 5 Mar 2013 03:54:06 +0100 Subject: [PATCH 14/15] ARM: 7667/1: perf: Fix section mismatch on armpmu_init() WARNING: vmlinux.o(.text+0xfb80): Section mismatch in reference from the function armpmu_register() to the function .init.text:armpmu_init() The function armpmu_register() references the function __init armpmu_init(). 
This is often because armpmu_register lacks a __init annotation or the annotation of armpmu_init is wrong. Just drop the __init marking on armpmu_init() because armpmu_register() no longer has an __init marking. Acked-by: Will Deacon Signed-off-by: Stephen Boyd Signed-off-by: Russell King --- arch/arm/kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index a89206799db8..146157dfe27c 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -484,7 +484,7 @@ const struct dev_pm_ops armpmu_dev_pm_ops = { SET_RUNTIME_PM_OPS(armpmu_runtime_suspend, armpmu_runtime_resume, NULL) }; -static void __init armpmu_init(struct arm_pmu *armpmu) +static void armpmu_init(struct arm_pmu *armpmu) { atomic_set(&armpmu->active_events, 0); mutex_init(&armpmu->reserve_mutex); From 455bd4c430b0c0a361f38e8658a0d6cb469942b5 Mon Sep 17 00:00:00 2001 From: Ivan Djelic Date: Wed, 6 Mar 2013 20:09:27 +0100 Subject: [PATCH 15/15] ARM: 7668/1: fix memset-related crashes caused by recent GCC (4.7.2) optimizations Recent GCC versions (e.g. GCC-4.7.2) perform optimizations based on assumptions about the implementation of memset and similar functions. The current ARM optimized memset code does not return the value of its first argument, as is usually expected from standard implementations. For instance in the following function: void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); waiter->magic = waiter; INIT_LIST_HEAD(&waiter->list); } compiled as: 800554d0 : 800554d0: e92d4008 push {r3, lr} 800554d4: e1a00001 mov r0, r1 800554d8: e3a02010 mov r2, #16 ; 0x10 800554dc: e3a01011 mov r1, #17 ; 0x11 800554e0: eb04426e bl 80165ea0 800554e4: e1a03000 mov r3, r0 800554e8: e583000c str r0, [r3, #12] 800554ec: e5830000 str r0, [r3] 800554f0: e5830004 str r0, [r3, #4] 800554f4: e8bd8008 pop {r3, pc} GCC assumes memset returns the value of pointer 'waiter' in register r0; causing register/memory corruptions. This patch fixes the return value of the assembly version of memset. It adds a 'mov' instruction and merges an additional load+store into existing load/store instructions. For ease of review, here is a breakdown of the patch into 4 simple steps: Step 1 ====== Perform the following substitutions: ip -> r8, then r0 -> ip, and insert 'mov ip, r0' as the first statement of the function. At this point, we have a memset() implementation returning the proper result, but corrupting r8 on some paths (the ones that were using ip). Step 2 ====== Make sure r8 is saved and restored when (! CALGN(1)+0) == 1: save r8: - str lr, [sp, #-4]! + stmfd sp!, {r8, lr} and restore r8 on both exit paths: - ldmeqfd sp!, {pc} @ Now <64 bytes to go. + ldmeqfd sp!, {r8, pc} @ Now <64 bytes to go. (...) tst r2, #16 stmneia ip!, {r1, r3, r8, lr} - ldr lr, [sp], #4 + ldmfd sp!, {r8, lr} Step 3 ====== Make sure r8 is saved and restored when (! CALGN(1)+0) == 0: save r8: - stmfd sp!, {r4-r7, lr} + stmfd sp!, {r4-r8, lr} and restore r8 on both exit paths: bgt 3b - ldmeqfd sp!, {r4-r7, pc} + ldmeqfd sp!, {r4-r8, pc} (...) tst r2, #16 stmneia ip!, {r4-r7} - ldmfd sp!, {r4-r7, lr} + ldmfd sp!, {r4-r8, lr} Step 4 ====== Rewrite register list "r4-r7, r8" as "r4-r8". 
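For reference, the contract being restored is the standard C one:

    void *memset(void *s, int c, size_t n);   /* ISO C: returns s */

so a compiler may freely reuse the value returned in r0 — exactly the assumption visible in the disassembly above — and any assembly implementation must hand back the original destination pointer.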
Signed-off-by: Ivan Djelic Reviewed-by: Nicolas Pitre Signed-off-by: Dirk Behme Signed-off-by: Russell King --- arch/arm/lib/memset.S | 85 ++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S index 650d5923ab83..d912e7397ecc 100644 --- a/arch/arm/lib/memset.S +++ b/arch/arm/lib/memset.S @@ -19,9 +19,9 @@ 1: subs r2, r2, #4 @ 1 do we have enough blt 5f @ 1 bytes to align with? cmp r3, #2 @ 1 - strltb r1, [r0], #1 @ 1 - strleb r1, [r0], #1 @ 1 - strb r1, [r0], #1 @ 1 + strltb r1, [ip], #1 @ 1 + strleb r1, [ip], #1 @ 1 + strb r1, [ip], #1 @ 1 add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) /* * The pointer is now aligned and the length is adjusted. Try doing the @@ -29,10 +29,14 @@ */ ENTRY(memset) - ands r3, r0, #3 @ 1 unaligned? +/* + * Preserve the contents of r0 for the return value. + */ + mov ip, r0 + ands r3, ip, #3 @ 1 unaligned? bne 1b @ 1 /* - * we know that the pointer in r0 is aligned to a word boundary. + * we know that the pointer in ip is aligned to a word boundary. */ orr r1, r1, r1, lsl #8 orr r1, r1, r1, lsl #16 @@ -43,29 +47,28 @@ ENTRY(memset) #if ! CALGN(1)+0 /* - * We need an extra register for this loop - save the return address and - * use the LR + * We need 2 extra registers for this loop - use r8 and the LR */ - str lr, [sp, #-4]! - mov ip, r1 + stmfd sp!, {r8, lr} + mov r8, r1 mov lr, r1 2: subs r2, r2, #64 - stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. - stmgeia r0!, {r1, r3, ip, lr} - stmgeia r0!, {r1, r3, ip, lr} - stmgeia r0!, {r1, r3, ip, lr} + stmgeia ip!, {r1, r3, r8, lr} @ 64 bytes at a time. + stmgeia ip!, {r1, r3, r8, lr} + stmgeia ip!, {r1, r3, r8, lr} + stmgeia ip!, {r1, r3, r8, lr} bgt 2b - ldmeqfd sp!, {pc} @ Now <64 bytes to go. + ldmeqfd sp!, {r8, pc} @ Now <64 bytes to go. /* * No need to correct the count; we're only testing bits from now on */ tst r2, #32 - stmneia r0!, {r1, r3, ip, lr} - stmneia r0!, {r1, r3, ip, lr} + stmneia ip!, {r1, r3, r8, lr} + stmneia ip!, {r1, r3, r8, lr} tst r2, #16 - stmneia r0!, {r1, r3, ip, lr} - ldr lr, [sp], #4 + stmneia ip!, {r1, r3, r8, lr} + ldmfd sp!, {r8, lr} #else @@ -74,54 +77,54 @@ ENTRY(memset) * whole cache lines at once. */ - stmfd sp!, {r4-r7, lr} + stmfd sp!, {r4-r8, lr} mov r4, r1 mov r5, r1 mov r6, r1 mov r7, r1 - mov ip, r1 + mov r8, r1 mov lr, r1 cmp r2, #96 - tstgt r0, #31 + tstgt ip, #31 ble 3f - and ip, r0, #31 - rsb ip, ip, #32 - sub r2, r2, ip - movs ip, ip, lsl #(32 - 4) - stmcsia r0!, {r4, r5, r6, r7} - stmmiia r0!, {r4, r5} - tst ip, #(1 << 30) - mov ip, r1 - strne r1, [r0], #4 + and r8, ip, #31 + rsb r8, r8, #32 + sub r2, r2, r8 + movs r8, r8, lsl #(32 - 4) + stmcsia ip!, {r4, r5, r6, r7} + stmmiia ip!, {r4, r5} + tst r8, #(1 << 30) + mov r8, r1 + strne r1, [ip], #4 3: subs r2, r2, #64 - stmgeia r0!, {r1, r3-r7, ip, lr} - stmgeia r0!, {r1, r3-r7, ip, lr} + stmgeia ip!, {r1, r3-r8, lr} + stmgeia ip!, {r1, r3-r8, lr} bgt 3b - ldmeqfd sp!, {r4-r7, pc} + ldmeqfd sp!, {r4-r8, pc} tst r2, #32 - stmneia r0!, {r1, r3-r7, ip, lr} + stmneia ip!, {r1, r3-r8, lr} tst r2, #16 - stmneia r0!, {r4-r7} - ldmfd sp!, {r4-r7, lr} + stmneia ip!, {r4-r7} + ldmfd sp!, {r4-r8, lr} #endif 4: tst r2, #8 - stmneia r0!, {r1, r3} + stmneia ip!, {r1, r3} tst r2, #4 - strne r1, [r0], #4 + strne r1, [ip], #4 /* * When we get here, we've got less than 4 bytes to zero. We * may have an unaligned pointer as well. 
*/ 5: tst r2, #2 - strneb r1, [r0], #1 - strneb r1, [r0], #1 + strneb r1, [ip], #1 + strneb r1, [ip], #1 tst r2, #1 - strneb r1, [r0], #1 + strneb r1, [ip], #1 mov pc, lr ENDPROC(memset)