From 0eb948dd7f7c3cec37440c16a6c738c8e75efcda Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 19 Nov 2009 11:12:15 +0000 Subject: [PATCH 1/2] ARM: cache-l2x0: avoid taking spinlock for every iteration Taking the spinlock for every iteration is very expensive; instead, batch iterations up into 4K blocks, releasing and reacquiring the spinlock between each block. Signed-off-by: Russell King Acked-by: Catalin Marinas --- arch/arm/mm/cache-l2x0.c | 65 ++++++++++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c index b480f1d3591f..c1b7bfff47f4 100644 --- a/arch/arm/mm/cache-l2x0.c +++ b/arch/arm/mm/cache-l2x0.c @@ -31,14 +31,10 @@ static DEFINE_SPINLOCK(l2x0_lock); static inline void sync_writel(unsigned long val, unsigned long reg, unsigned long complete_mask) { - unsigned long flags; - - spin_lock_irqsave(&l2x0_lock, flags); writel(val, l2x0_base + reg); /* wait for the operation to complete */ while (readl(l2x0_base + reg) & complete_mask) ; - spin_unlock_irqrestore(&l2x0_lock, flags); } static inline void cache_sync(void) @@ -48,15 +44,20 @@ static inline void cache_sync(void) static inline void l2x0_inv_all(void) { + unsigned long flags; + /* invalidate all ways */ + spin_lock_irqsave(&l2x0_lock, flags); sync_writel(0xff, L2X0_INV_WAY, 0xff); cache_sync(); + spin_unlock_irqrestore(&l2x0_lock, flags); } static void l2x0_inv_range(unsigned long start, unsigned long end) { - unsigned long addr; + unsigned long flags; + spin_lock_irqsave(&l2x0_lock, flags); if (start & (CACHE_LINE_SIZE - 1)) { start &= ~(CACHE_LINE_SIZE - 1); sync_writel(start, L2X0_CLEAN_INV_LINE_PA, 1); @@ -68,29 +69,67 @@ static void l2x0_inv_range(unsigned long start, unsigned long end) sync_writel(end, L2X0_CLEAN_INV_LINE_PA, 1); } - for (addr = start; addr < end; addr += CACHE_LINE_SIZE) - sync_writel(addr, L2X0_INV_LINE_PA, 1); + while (start < end) { + unsigned long blk_end = start + min(end - start, 4096UL); + + while (start < blk_end) { + sync_writel(start, L2X0_INV_LINE_PA, 1); + start += CACHE_LINE_SIZE; + } + + if (blk_end < end) { + spin_unlock_irqrestore(&l2x0_lock, flags); + spin_lock_irqsave(&l2x0_lock, flags); + } + } cache_sync(); + spin_unlock_irqrestore(&l2x0_lock, flags); } static void l2x0_clean_range(unsigned long start, unsigned long end) { - unsigned long addr; + unsigned long flags; + spin_lock_irqsave(&l2x0_lock, flags); start &= ~(CACHE_LINE_SIZE - 1); - for (addr = start; addr < end; addr += CACHE_LINE_SIZE) - sync_writel(addr, L2X0_CLEAN_LINE_PA, 1); + while (start < end) { + unsigned long blk_end = start + min(end - start, 4096UL); + + while (start < blk_end) { + sync_writel(start, L2X0_CLEAN_LINE_PA, 1); + start += CACHE_LINE_SIZE; + } + + if (blk_end < end) { + spin_unlock_irqrestore(&l2x0_lock, flags); + spin_lock_irqsave(&l2x0_lock, flags); + } + } cache_sync(); + spin_unlock_irqrestore(&l2x0_lock, flags); } static void l2x0_flush_range(unsigned long start, unsigned long end) { - unsigned long addr; + unsigned long flags; + spin_lock_irqsave(&l2x0_lock, flags); start &= ~(CACHE_LINE_SIZE - 1); - for (addr = start; addr < end; addr += CACHE_LINE_SIZE) - sync_writel(addr, L2X0_CLEAN_INV_LINE_PA, 1); + while (start < end) { + unsigned long blk_end = start + min(end - start, 4096UL); + + while (start < blk_end) { + sync_writel(start, L2X0_CLEAN_INV_LINE_PA, 1); + start += CACHE_LINE_SIZE; + } + + if (blk_end < end) { + spin_unlock_irqrestore(&l2x0_lock, flags); + spin_lock_irqsave(&l2x0_lock, flags); + } + } cache_sync(); + spin_unlock_irqrestore(&l2x0_lock, flags); } void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask) From 3d1074349b22c9653e746282564136c87668c2b8 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 19 Nov 2009 11:41:09 +0000 Subject: [PATCH 2/2] ARM: cache-l2x0: make better use of background cache handling There's no point having the hardware support background operations if we issue a cache operation, and then wait for it to complete before calculating the address of the next operation. We gain no advantage in the cache controller stalling the bus until completion. What we should be doing is using the 'wait' time productively by calculating the address of the next operation, and only then waiting for the previous operation to complete. This means that cache operations can occur in parallel with the CPU calculating the next address. Signed-off-by: Russell King Acked-by: Catalin Marinas --- arch/arm/mm/cache-l2x0.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c index c1b7bfff47f4..ec85dda1e733 100644 --- a/arch/arm/mm/cache-l2x0.c +++ b/arch/arm/mm/cache-l2x0.c @@ -28,18 +28,18 @@ static void __iomem *l2x0_base; static DEFINE_SPINLOCK(l2x0_lock); -static inline void sync_writel(unsigned long val, unsigned long reg, - unsigned long complete_mask) +static inline void cache_wait(void __iomem *reg, unsigned long mask) { - writel(val, l2x0_base + reg); /* wait for the operation to complete */ - while (readl(l2x0_base + reg) & complete_mask) + while (readl(reg) & mask) ; } static inline void cache_sync(void) { - sync_writel(0, L2X0_CACHE_SYNC, 1); + void __iomem *base = l2x0_base; + writel(0, base + L2X0_CACHE_SYNC); + cache_wait(base + L2X0_CACHE_SYNC, 1); } static inline void l2x0_inv_all(void) @@ -48,32 +48,37 @@ static inline void l2x0_inv_all(void) /* invalidate all ways */ spin_lock_irqsave(&l2x0_lock, flags); - sync_writel(0xff, L2X0_INV_WAY, 0xff); + writel(0xff, l2x0_base + L2X0_INV_WAY); + cache_wait(l2x0_base + L2X0_INV_WAY, 0xff); cache_sync(); spin_unlock_irqrestore(&l2x0_lock, flags); } static void l2x0_inv_range(unsigned long start, unsigned long end) { + void __iomem *base = l2x0_base; unsigned long flags; spin_lock_irqsave(&l2x0_lock, flags); if (start & (CACHE_LINE_SIZE - 1)) { start &= ~(CACHE_LINE_SIZE - 1); - sync_writel(start, L2X0_CLEAN_INV_LINE_PA, 1); + cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1); + writel(start, base + L2X0_CLEAN_INV_LINE_PA); start += CACHE_LINE_SIZE; } if (end & (CACHE_LINE_SIZE - 1)) { end &= ~(CACHE_LINE_SIZE - 1); - sync_writel(end, L2X0_CLEAN_INV_LINE_PA, 1); + cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1); + writel(end, base + L2X0_CLEAN_INV_LINE_PA); } while (start < end) { unsigned long blk_end = start + min(end - start, 4096UL); while (start < blk_end) { - sync_writel(start, L2X0_INV_LINE_PA, 1); + cache_wait(base + L2X0_INV_LINE_PA, 1); + writel(start, base + L2X0_INV_LINE_PA); start += CACHE_LINE_SIZE; } @@ -82,12 +87,14 @@ static void l2x0_inv_range(unsigned long start, unsigned long end) spin_lock_irqsave(&l2x0_lock, flags); } } + cache_wait(base + L2X0_INV_LINE_PA, 1); cache_sync(); spin_unlock_irqrestore(&l2x0_lock, flags); } static void l2x0_clean_range(unsigned long start, unsigned long end) { + void __iomem *base = l2x0_base; unsigned long flags; spin_lock_irqsave(&l2x0_lock, flags); @@ -96,7 +103,8 @@ static void l2x0_clean_range(unsigned long start, unsigned long end) unsigned long blk_end = start + min(end - start, 4096UL); while (start < blk_end) { - sync_writel(start, L2X0_CLEAN_LINE_PA, 1); + cache_wait(base + L2X0_CLEAN_LINE_PA, 1); + writel(start, base + L2X0_CLEAN_LINE_PA); start += CACHE_LINE_SIZE; } @@ -105,12 +113,14 @@ static void l2x0_clean_range(unsigned long start, unsigned long end) spin_lock_irqsave(&l2x0_lock, flags); } } + cache_wait(base + L2X0_CLEAN_LINE_PA, 1); cache_sync(); spin_unlock_irqrestore(&l2x0_lock, flags); } static void l2x0_flush_range(unsigned long start, unsigned long end) { + void __iomem *base = l2x0_base; unsigned long flags; spin_lock_irqsave(&l2x0_lock, flags); @@ -119,7 +129,8 @@ static void l2x0_flush_range(unsigned long start, unsigned long end) unsigned long blk_end = start + min(end - start, 4096UL); while (start < blk_end) { - sync_writel(start, L2X0_CLEAN_INV_LINE_PA, 1); + cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1); + writel(start, base + L2X0_CLEAN_INV_LINE_PA); start += CACHE_LINE_SIZE; } @@ -128,6 +139,7 @@ static void l2x0_flush_range(unsigned long start, unsigned long end) spin_lock_irqsave(&l2x0_lock, flags); } } + cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1); cache_sync(); spin_unlock_irqrestore(&l2x0_lock, flags); }