diff --git a/Documentation/devicetree/bindings/arc/archs-pct.txt b/Documentation/devicetree/bindings/arc/archs-pct.txt new file mode 100644 index 000000000000..1ae98b87c640 --- /dev/null +++ b/Documentation/devicetree/bindings/arc/archs-pct.txt @@ -0,0 +1,17 @@ +* ARC HS Performance Counters + +The ARC HS can be configured with a pipeline performance monitor for counting +CPU and cache events like cache misses and hits. As with the conventional PCT, there +are 100+ hardware conditions dynamically mapped to up to 32 counters. +It also supports overflow interrupts. + +Required properties: + +- compatible : should contain + "snps,archs-pct" + +Example: + +pmu { + compatible = "snps,archs-pct"; +}; diff --git a/MAINTAINERS b/MAINTAINERS index 963233fc721b..4be50b79914e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9911,8 +9911,9 @@ SYNOPSYS ARC ARCHITECTURE M: Vineet Gupta S: Supported F: arch/arc/ -F: Documentation/devicetree/bindings/arc/ +F: Documentation/devicetree/bindings/arc/* F: drivers/tty/serial/arc_uart.c +T: git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git SYNOPSYS ARC SDP platform support M: Alexey Brodkin diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index bd4670d1b89b..78c0621d5819 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -8,6 +8,7 @@ config ARC def_bool y + select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC select BUILDTIME_EXTABLE_SORT select COMMON_CLK select CLONE_BACKWARDS @@ -22,6 +23,7 @@ config ARC select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK + select HAVE_FUTEX_CMPXCHG select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_KRETPROBES diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index 1cd5e82f5dc2..846481f37eef 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -72,12 +72,13 @@ }; /* - * This INTC is actually connected to DW APB GPIO - * which acts as a wire between MB INTC and CPU INTC. - * GPIO INTC is configured in platform init code - * and here we mimic direct connection from MB INTC to - * CPU INTC, thus we set "interrupts = <7>" instead of - * "interrupts = <12>" + * The DW APB ICTL intc on MB is connected to CPU intc via a + * DT "invisible" DW APB GPIO block, configured to simply pass through + * interrupts - set up accordingly in platform init (plat-axs10x/axs10x.c) + * + * So here we mimic a direct connection between them, ignoring the + * APB GPIO.
Thus set "interrupts = <24>" (DW APB GPIO to core) + * instead of "interrupts = <12>" (DW APB ICTL to DW APB GPIO) * * This intc actually resides on MB, but we move it here to * avoid duplicating the MB dtsi file given that IRQ from diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index c8f57b8449dc..d8023bc8d1ad 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -35,6 +35,7 @@ #define ARC_REG_RTT_BCR 0xF2 #define ARC_REG_IRQ_BCR 0xF3 #define ARC_REG_SMART_BCR 0xFF +#define ARC_REG_CLUSTER_BCR 0xcf /* status32 Bits Positions */ #define STATUS_AE_BIT 5 /* Exception active */ diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index d67345d3e2d4..e23ea6e7633a 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -53,6 +53,8 @@ extern void arc_cache_init(void); extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len); extern void read_decode_cache_bcr(void); +extern int ioc_exists; + #endif /* !__ASSEMBLY__ */ /* Instruction cache related Auxiliary registers */ @@ -94,4 +96,10 @@ extern void read_decode_cache_bcr(void); #define SLC_CTRL_BUSY 0x100 #define SLC_CTRL_RGN_OP_INV 0x200 +/* IO coherency related Auxiliary registers */ +#define ARC_REG_IO_COH_ENABLE 0x500 +#define ARC_REG_IO_COH_PARTIAL 0x501 +#define ARC_REG_IO_COH_AP0_BASE 0x508 +#define ARC_REG_IO_COH_AP0_SIZE 0x509 + #endif /* _ASM_CACHE_H */ diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h index 44fd531f4d7b..af7a2db139c9 100644 --- a/arch/arc/include/asm/cmpxchg.h +++ b/arch/arc/include/asm/cmpxchg.h @@ -110,18 +110,18 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr, sizeof(*(ptr)))) /* - * On ARC700, EX insn is inherently atomic, so by default "vanilla" xchg() need - * not require any locking. However there's a quirk. - * ARC lacks native CMPXCHG, thus emulated (see above), using external locking - - * incidently it "reuses" the same atomic_ops_lock used by atomic APIs. - * Now, llist code uses cmpxchg() and xchg() on same data, so xchg() needs to - * abide by same serializing rules, thus ends up using atomic_ops_lock as well. + * xchg() maps directly to ARC EX instruction which guarantees atomicity. + * However in !LLSC config, it also needs to be use @atomic_ops_lock spinlock + * due to a subtle reason: + * - For !LLSC, cmpxchg() needs to use that lock (see above) and there is lot + * of kernel code which calls xchg()/cmpxchg() on same data (see llist.h) + * Hence xchg() needs to follow same locking rules. 
* - * This however is only relevant if SMP and/or ARC lacks LLSC - * if (UP or LLSC) - * xchg doesn't need serialization - * else <==> !(UP or LLSC) <==> (!UP and !LLSC) <==> (SMP and !LLSC) - * xchg needs serialization + * Technically the lock is also needed for UP (boils down to irq save/restore) + * but we can cheat a bit since cmpxchg() atomic_ops_lock() would cause irqs to + * be disabled thus can't possibly be interrupted/preempted/clobbered by xchg() + * The other way around, xchg() is one instruction anyway, so can't be interrupted + * as such */ #if !defined(CONFIG_ARC_HAS_LLSC) && defined(CONFIG_SMP) diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 70cfe16b742d..11e1b1f3acda 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -20,6 +20,7 @@ #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)\ \ + smp_mb(); \ __asm__ __volatile__( \ "1: llock %1, [%2] \n" \ insn "\n" \ @@ -30,7 +31,7 @@ " .section .fixup,\"ax\" \n" \ " .align 4 \n" \ "4: mov %0, %4 \n" \ - " b 3b \n" \ + " j 3b \n" \ " .previous \n" \ " .section __ex_table,\"a\" \n" \ " .align 4 \n" \ @@ -40,12 +41,14 @@ \ : "=&r" (ret), "=&r" (oldval) \ : "r" (uaddr), "r" (oparg), "ir" (-EFAULT) \ - : "cc", "memory") + : "cc", "memory"); \ + smp_mb() \ #else /* !CONFIG_ARC_HAS_LLSC */ #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)\ \ + smp_mb(); \ __asm__ __volatile__( \ "1: ld %1, [%2] \n" \ insn "\n" \ @@ -55,7 +58,7 @@ " .section .fixup,\"ax\" \n" \ " .align 4 \n" \ "4: mov %0, %4 \n" \ - " b 3b \n" \ + " j 3b \n" \ " .previous \n" \ " .section __ex_table,\"a\" \n" \ " .align 4 \n" \ @@ -65,7 +68,8 @@ \ : "=&r" (ret), "=&r" (oldval) \ : "r" (uaddr), "r" (oparg), "ir" (-EFAULT) \ - : "cc", "memory") + : "cc", "memory"); \ + smp_mb() \ #endif @@ -83,6 +87,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; +#ifndef CONFIG_ARC_HAS_LLSC + preempt_disable(); /* to guarantee atomic r-m-w of futex op */ +#endif pagefault_disable(); switch (op) { @@ -90,6 +97,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) __futex_atomic_op("mov %0, %3", ret, oldval, uaddr, oparg); break; case FUTEX_OP_ADD: + /* oldval = *uaddr; *uaddr += oparg ; ret = *uaddr */ __futex_atomic_op("add %0, %1, %3", ret, oldval, uaddr, oparg); break; case FUTEX_OP_OR: @@ -106,6 +114,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); +#ifndef CONFIG_ARC_HAS_LLSC + preempt_enable(); +#endif if (!ret) { switch (cmp) { @@ -134,54 +145,57 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) return ret; } -/* Compare-xchg with pagefaults disabled. - * Notes: - * -Best-Effort: Exchg happens only if compare succeeds.
- * If compare fails, returns; leaving retry/looping to upper layers - * -successful cmp-xchg: return orig value in @addr (same as cmp val) - * -Compare fails: return orig value in @addr - * -user access r/w fails: return -EFAULT +/* + * cmpxchg of futex (pagefaults disabled by caller) + * Return 0 for success, -EFAULT otherwise */ static inline int -futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, - u32 newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, + u32 newval) { - u32 val; + int ret = 0; + u32 existval; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - pagefault_disable(); +#ifndef CONFIG_ARC_HAS_LLSC + preempt_disable(); /* to guarantee atomic r-m-w of futex op */ +#endif + smp_mb(); __asm__ __volatile__( #ifdef CONFIG_ARC_HAS_LLSC - "1: llock %0, [%3] \n" - " brne %0, %1, 3f \n" - "2: scond %2, [%3] \n" + "1: llock %1, [%4] \n" + " brne %1, %2, 3f \n" + "2: scond %3, [%4] \n" " bnz 1b \n" #else - "1: ld %0, [%3] \n" - " brne %0, %1, 3f \n" - "2: st %2, [%3] \n" + "1: ld %1, [%4] \n" + " brne %1, %2, 3f \n" + "2: st %3, [%4] \n" #endif "3: \n" " .section .fixup,\"ax\" \n" - "4: mov %0, %4 \n" - " b 3b \n" + "4: mov %0, %5 \n" + " j 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" " .align 4 \n" " .word 1b, 4b \n" " .word 2b, 4b \n" " .previous\n" - : "=&r"(val) - : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) + : "+&r"(ret), "=&r"(existval) + : "r"(expval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) : "cc", "memory"); - pagefault_enable(); + smp_mb(); - *uval = val; - return val; +#ifndef CONFIG_ARC_HAS_LLSC + preempt_enable(); +#endif + *uval = existval; + return ret; } #endif diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 2b8880e953a2..5f071762fb1c 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -1,6 +1,7 @@ /* * Linux performance counter support for ARC * + * Copyright (C) 2014-2015 Synopsys, Inc. (www.synopsys.com) * Copyright (C) 2011-2013 Synopsys, Inc. 
(www.synopsys.com) * * This program is free software; you can redistribute it and/or modify @@ -12,8 +13,8 @@ #ifndef __ASM_PERF_EVENT_H #define __ASM_PERF_EVENT_H -/* real maximum varies per CPU, this is the maximum supported by the driver */ -#define ARC_PMU_MAX_HWEVENTS 64 +/* Max number of counters that PCT block may ever have */ +#define ARC_PERF_MAX_COUNTERS 32 #define ARC_REG_CC_BUILD 0xF6 #define ARC_REG_CC_INDEX 0x240 @@ -28,15 +29,22 @@ #define ARC_REG_PCT_CONFIG 0x254 #define ARC_REG_PCT_CONTROL 0x255 #define ARC_REG_PCT_INDEX 0x256 +#define ARC_REG_PCT_INT_CNTL 0x25C +#define ARC_REG_PCT_INT_CNTH 0x25D +#define ARC_REG_PCT_INT_CTRL 0x25E +#define ARC_REG_PCT_INT_ACT 0x25F + +#define ARC_REG_PCT_CONFIG_USER (1 << 18) /* count in user mode */ +#define ARC_REG_PCT_CONFIG_KERN (1 << 19) /* count in kernel mode */ #define ARC_REG_PCT_CONTROL_CC (1 << 16) /* clear counts */ #define ARC_REG_PCT_CONTROL_SN (1 << 17) /* snapshot */ struct arc_reg_pct_build { #ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int m:8, c:8, r:6, s:2, v:8; + unsigned int m:8, c:8, r:5, i:1, s:2, v:8; #else - unsigned int v:8, s:2, r:6, c:8, m:8; + unsigned int v:8, s:2, i:1, r:5, c:8, m:8; #endif }; @@ -95,10 +103,13 @@ static const char * const arc_pmu_ev_hw_map[] = { /* counts condition */ [PERF_COUNT_HW_INSTRUCTIONS] = "iall", - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", /* Excludes ZOL jumps */ [PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */ +#ifdef CONFIG_ISA_ARCV2 + [PERF_COUNT_HW_BRANCH_MISSES] = "bpmp", +#else [PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */ - +#endif [PERF_COUNT_ARC_LDC] = "imemrdc", /* Instr: mem read cached */ [PERF_COUNT_ARC_STC] = "imemwrc", /* Instr: mem write cached */ diff --git a/arch/arc/kernel/entry-arcv2.S b/arch/arc/kernel/entry-arcv2.S index bd7105d3172f..8fa76567e402 100644 --- a/arch/arc/kernel/entry-arcv2.S +++ b/arch/arc/kernel/entry-arcv2.S @@ -57,13 +57,8 @@ VECTOR handle_interrupt ; (23) End of fixed IRQs .section .text, "ax",@progbits -res_service: ; processor restart - flag 0x1 ; not implemented - nop - nop - -reserved: ; processor restart - rtie ; jump to processor initializations +reserved: + flag 1 ; Unexpected event, halt ;##################### Interrupt Handling ############################## diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index f7a82fd4d601..589abf5172d6 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -42,7 +42,7 @@ ENTRY(ret_from_fork) ; when the forked child comes here from the __switch_to function ; r0 has the last task pointer. 
; put last task in scheduler queue - bl @schedule_tail + jl @schedule_tail ld r9, [sp, PT_status32] brne r9, 0, 1f @@ -320,7 +320,7 @@ resume_user_mode_begin: ; --- (Slow Path #1) task preemption --- bbit0 r9, TIF_NEED_RESCHED, .Lchk_pend_signals mov blink, resume_user_mode_begin ; tail-call to U mode ret chks - b @schedule ; BTST+Bnz causes relo error in link + j @schedule ; BTST+Bnz causes relo error in link .Lchk_pend_signals: IRQ_ENABLE r10 @@ -381,7 +381,7 @@ resume_kernel_mode: bbit0 r9, TIF_NEED_RESCHED, .Lrestore_regs ; Invoke PREEMPTION - bl preempt_schedule_irq + jl preempt_schedule_irq ; preempt_schedule_irq() always returns with IRQ disabled #endif diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 1287388c258a..0c08bb1ce15a 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -1,7 +1,7 @@ /* * Linux performance counter support for ARC700 series * - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013-2015 Synopsys, Inc. (www.synopsys.com) * * This code is inspired by the perf support of various other architectures. * @@ -11,6 +11,7 @@ * */ #include <linux/errno.h> +#include <linux/interrupt.h> #include <linux/module.h> #include <linux/of.h> #include <linux/perf_event.h> @@ -20,12 +21,25 @@ struct arc_pmu { struct pmu pmu; - int counter_size; /* in bits */ + unsigned int irq; int n_counters; - unsigned long used_mask[BITS_TO_LONGS(ARC_PMU_MAX_HWEVENTS)]; + u64 max_period; int ev_hw_idx[PERF_COUNT_ARC_HW_MAX]; }; +struct arc_pmu_cpu { + /* + * A 1 bit for an index indicates that the counter is being used for + * an event. A 0 means that the counter can be used. + */ + unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; + + /* + * The events that are active on the PMU for the given index. + */ + struct perf_event *act_counter[ARC_PERF_MAX_COUNTERS]; +}; + struct arc_callchain_trace { int depth; void *perf_stuff; @@ -65,6 +79,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) } static struct arc_pmu *arc_pmu; +static DEFINE_PER_CPU(struct arc_pmu_cpu, arc_pmu_cpu); /* read counter #idx; note that counter# != event# on ARC! */ static uint64_t arc_pmu_read_counter(int idx) @@ -88,18 +103,15 @@ static uint64_t arc_pmu_read_counter(int idx) static void arc_perf_event_update(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - uint64_t prev_raw_count, new_raw_count; - int64_t delta; - - do { - prev_raw_count = local64_read(&hwc->prev_count); - new_raw_count = arc_pmu_read_counter(idx); - } while (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count); - - delta = (new_raw_count - prev_raw_count) & - ((1ULL << arc_pmu->counter_size) - 1ULL); + uint64_t prev_raw_count = local64_read(&hwc->prev_count); + uint64_t new_raw_count = arc_pmu_read_counter(idx); + int64_t delta = new_raw_count - prev_raw_count; + /* + * We aren't afraid of hwc->prev_count changing beneath our feet + * because there's no way for us to re-enter this function anytime.
+ */ + local64_set(&hwc->prev_count, new_raw_count); local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); } @@ -142,22 +154,41 @@ static int arc_pmu_event_init(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int ret; + if (!is_sampling_event(event)) { + hwc->sample_period = arc_pmu->max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } + + hwc->config = 0; + + if (is_isa_arcv2()) { + /* "exclude user" means "count only kernel" */ + if (event->attr.exclude_user) + hwc->config |= ARC_REG_PCT_CONFIG_KERN; + + /* "exclude kernel" means "count only user" */ + if (event->attr.exclude_kernel) + hwc->config |= ARC_REG_PCT_CONFIG_USER; + } + switch (event->attr.type) { case PERF_TYPE_HARDWARE: if (event->attr.config >= PERF_COUNT_HW_MAX) return -ENOENT; if (arc_pmu->ev_hw_idx[event->attr.config] < 0) return -ENOENT; - hwc->config = arc_pmu->ev_hw_idx[event->attr.config]; + hwc->config |= arc_pmu->ev_hw_idx[event->attr.config]; pr_debug("init event %d with h/w %d \'%s\'\n", (int) event->attr.config, (int) hwc->config, arc_pmu_ev_hw_map[event->attr.config]); return 0; + case PERF_TYPE_HW_CACHE: ret = arc_pmu_cache_event(event->attr.config); if (ret < 0) return ret; - hwc->config = arc_pmu->ev_hw_idx[ret]; + hwc->config |= arc_pmu->ev_hw_idx[ret]; return 0; default: return -ENOENT; @@ -180,6 +211,47 @@ static void arc_pmu_disable(struct pmu *pmu) write_aux_reg(ARC_REG_PCT_CONTROL, (tmp & 0xffff0000) | 0x0); } +static int arc_pmu_event_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int idx = hwc->idx; + int overflow = 0; + u64 value; + + if (unlikely(left <= -period)) { + /* left underflowed by more than period. */ + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } else if (unlikely(left <= 0)) { + /* left underflowed by less than period. */ + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (left > arc_pmu->max_period) + left = arc_pmu->max_period; + + value = arc_pmu->max_period - left; + local64_set(&hwc->prev_count, value); + + /* Select counter */ + write_aux_reg(ARC_REG_PCT_INDEX, idx); + + /* Write value */ + write_aux_reg(ARC_REG_PCT_COUNTL, (u32)value); + write_aux_reg(ARC_REG_PCT_COUNTH, (value >> 32)); + + perf_event_update_userpage(event); + + return overflow; +} + /* * Assigns hardware counter to hardware condition. 
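The arc_pmu_event_set_period() logic just above follows the pattern used by most perf arch ports: the counter counts up and interrupts on overflow, so it is preloaded with (max_period - left) to make the IRQ fire after the remaining "left" events. A self-contained, hypothetical C model of just that arithmetic (the model_* names are not kernel symbols):

```c
#include <stdint.h>

struct model_event {
	int64_t period_left;	/* events remaining until next sample */
	int64_t sample_period;	/* requested sampling period */
};

/* Returns the raw value to program into COUNTL/COUNTH; sets *overflow
 * if one or more whole periods were missed since the last reload. */
static uint64_t model_set_period(struct model_event *ev, uint64_t max_period,
				 int *overflow)
{
	int64_t left = ev->period_left;
	int64_t period = ev->sample_period;

	*overflow = 0;
	if (left <= -period) {		/* lagging by more than one period */
		left = period;
		ev->period_left = left;
		*overflow = 1;
	} else if (left <= 0) {		/* lagging by less than one period */
		left += period;
		ev->period_left = left;
		*overflow = 1;
	}
	if (left > (int64_t)max_period)	/* clamp to what hw can count */
		left = (int64_t)max_period;

	/* counter wraps (and interrupts) after exactly "left" more events */
	return max_period - left;
}
```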
* Note that there is no separate start/stop mechanism; @@ -194,13 +266,20 @@ static void arc_pmu_start(struct perf_event *event, int flags) return; if (flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); - event->hw.state = 0; + hwc->state = 0; + + arc_pmu_event_set_period(event); + + /* Enable interrupt for this counter */ + if (is_sampling_event(event)) + write_aux_reg(ARC_REG_PCT_INT_CTRL, + read_aux_reg(ARC_REG_PCT_INT_CTRL) | (1 << idx)); /* enable ARC pmu here */ - write_aux_reg(ARC_REG_PCT_INDEX, idx); - write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config); + write_aux_reg(ARC_REG_PCT_INDEX, idx); /* counter # */ + write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config); /* condition */ } static void arc_pmu_stop(struct perf_event *event, int flags) @@ -208,6 +287,17 @@ static void arc_pmu_stop(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; + /* Disable interrupt for this counter */ + if (is_sampling_event(event)) { + /* + * Reset the interrupt flag by writing 1. This is required + * to make sure a pending interrupt is not left behind. + */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 1 << idx); + write_aux_reg(ARC_REG_PCT_INT_CTRL, + read_aux_reg(ARC_REG_PCT_INT_CTRL) & ~(1 << idx)); + } + if (!(event->hw.state & PERF_HES_STOPPED)) { /* stop ARC pmu here */ write_aux_reg(ARC_REG_PCT_INDEX, idx); @@ -227,8 +317,12 @@ static void arc_pmu_stop(struct perf_event *event, int flags) static void arc_pmu_del(struct perf_event *event, int flags) { + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); + arc_pmu_stop(event, PERF_EF_UPDATE); - __clear_bit(event->hw.idx, arc_pmu->used_mask); + __clear_bit(event->hw.idx, pmu_cpu->used_mask); + + pmu_cpu->act_counter[event->hw.idx] = 0; perf_event_update_userpage(event); } @@ -236,20 +330,31 @@ static void arc_pmu_del(struct perf_event *event, int flags) /* allocate hardware counter and optionally start counting */ static int arc_pmu_add(struct perf_event *event, int flags) { + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - if (__test_and_set_bit(idx, arc_pmu->used_mask)) { - idx = find_first_zero_bit(arc_pmu->used_mask, + if (__test_and_set_bit(idx, pmu_cpu->used_mask)) { + idx = find_first_zero_bit(pmu_cpu->used_mask, arc_pmu->n_counters); if (idx == arc_pmu->n_counters) return -EAGAIN; - __set_bit(idx, arc_pmu->used_mask); + __set_bit(idx, pmu_cpu->used_mask); hwc->idx = idx; } write_aux_reg(ARC_REG_PCT_INDEX, idx); + + pmu_cpu->act_counter[idx] = event; + + if (is_sampling_event(event)) { + /* Mimic full counter overflow as other arches do */ + write_aux_reg(ARC_REG_PCT_INT_CNTL, (u32)arc_pmu->max_period); + write_aux_reg(ARC_REG_PCT_INT_CNTH, + (arc_pmu->max_period >> 32)); + } + write_aux_reg(ARC_REG_PCT_CONFIG, 0); write_aux_reg(ARC_REG_PCT_COUNTL, 0); write_aux_reg(ARC_REG_PCT_COUNTH, 0); @@ -264,11 +369,82 @@ static int arc_pmu_add(struct perf_event *event, int flags) return 0; } +#ifdef CONFIG_ISA_ARCV2 +static irqreturn_t arc_pmu_intr(int irq, void *dev) +{ + struct perf_sample_data data; + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); + struct pt_regs *regs; + int active_ints; + int idx; + + arc_pmu_disable(&arc_pmu->pmu); + + active_ints = read_aux_reg(ARC_REG_PCT_INT_ACT); + + regs = get_irq_regs(); + + for (idx = 0; idx < arc_pmu->n_counters; idx++) { + struct perf_event *event = pmu_cpu->act_counter[idx]; + struct hw_perf_event *hwc; + + if
(!(active_ints & (1 << idx))) + continue; + + /* Reset the interrupt flag by writing 1 */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 1 << idx); + + /* + * On reset of "interrupt active" bit corresponding + * "interrupt enable" bit gets automatically reset as well. + * Now we need to re-enable interrupt for the counter. + */ + write_aux_reg(ARC_REG_PCT_INT_CTRL, + read_aux_reg(ARC_REG_PCT_INT_CTRL) | (1 << idx)); + + hwc = &event->hw; + + WARN_ON_ONCE(hwc->idx != idx); + + arc_perf_event_update(event, &event->hw, event->hw.idx); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!arc_pmu_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + arc_pmu_stop(event, 0); + } + + arc_pmu_enable(&arc_pmu->pmu); + + return IRQ_HANDLED; +} +#else + +static irqreturn_t arc_pmu_intr(int irq, void *dev) +{ + return IRQ_NONE; +} + +#endif /* CONFIG_ISA_ARCV2 */ + +void arc_cpu_pmu_irq_init(void) +{ + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); + + arc_request_percpu_irq(arc_pmu->irq, smp_processor_id(), arc_pmu_intr, + "ARC perf counters", pmu_cpu); + + /* Clear all pending interrupt flags */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff); +} + static int arc_pmu_device_probe(struct platform_device *pdev) { struct arc_reg_pct_build pct_bcr; struct arc_reg_cc_build cc_bcr; - int i, j; + int i, j, has_interrupts; + int counter_size; /* in bits */ union cc_name { struct { @@ -284,7 +460,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) pr_err("This core does not have performance counters!\n"); return -ENODEV; } - BUG_ON(pct_bcr.c > ARC_PMU_MAX_HWEVENTS); + BUG_ON(pct_bcr.c > ARC_PERF_MAX_COUNTERS); READ_BCR(ARC_REG_CC_BUILD, cc_bcr); BUG_ON(!cc_bcr.v); /* Counters exist but No countable conditions ? */ @@ -293,11 +469,16 @@ static int arc_pmu_device_probe(struct platform_device *pdev) if (!arc_pmu) return -ENOMEM; - arc_pmu->n_counters = pct_bcr.c; - arc_pmu->counter_size = 32 + (pct_bcr.s << 4); + has_interrupts = is_isa_arcv2() ? pct_bcr.i : 0; - pr_info("ARC perf\t: %d counters (%d bits), %d countable conditions\n", - arc_pmu->n_counters, arc_pmu->counter_size, cc_bcr.c); + arc_pmu->n_counters = pct_bcr.c; + counter_size = 32 + (pct_bcr.s << 4); + + arc_pmu->max_period = (1ULL << counter_size) / 2 - 1ULL; + + pr_info("ARC perf\t: %d counters (%d bits), %d conditions%s\n", + arc_pmu->n_counters, counter_size, cc_bcr.c, + has_interrupts ? ", [overflow IRQ support]":""); cc_name.str[8] = 0; for (i = 0; i < PERF_COUNT_ARC_HW_MAX; i++) @@ -332,8 +513,37 @@ static int arc_pmu_device_probe(struct platform_device *pdev) .read = arc_pmu_read, }; - /* ARC 700 PMU does not support sampling events */ - arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + if (has_interrupts) { + int irq = platform_get_irq(pdev, 0); + unsigned long flags; + + if (irq < 0) { + pr_err("Cannot get IRQ number for the platform\n"); + return -ENODEV; + } + + arc_pmu->irq = irq; + + /* + * arc_cpu_pmu_irq_init() needs to be called on all cores for + * their respective local PMU. + * However we use an open-coded on_each_cpu() to ensure it is called + * on core0 first, so that arc_request_percpu_irq() sets up + * AUTOEN etc. Otherwise enable_percpu_irq() fails to enable + * perf IRQ on non-master cores.
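The ordering constraint being described here - boot core first, with IRQs off, then the remaining cores via IPI - is what the open-coded on_each_cpu() below implements. A hedged sketch of that pattern in isolation (run_on_all_cpus_boot_first() is a hypothetical helper, not a kernel symbol):

```c
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <linux/smp.h>

static void run_on_all_cpus_boot_first(smp_call_func_t setup, void *arg)
{
	unsigned long flags;

	preempt_disable();		/* stay pinned to the current (boot) CPU */
	local_irq_save(flags);
	setup(arg);			/* master core first: sets up AUTOEN etc. */
	local_irq_restore(flags);
	smp_call_function(setup, arg, 1);	/* all other cores, wait for them */
	preempt_enable();
}
```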
+ * see arc_request_percpu_irq() + */ + preempt_disable(); + local_irq_save(flags); + arc_cpu_pmu_irq_init(); + local_irq_restore(flags); + smp_call_function((smp_call_func_t)arc_cpu_pmu_irq_init, 0, 1); + preempt_enable(); + + /* Clear all pending interrupt flags */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff); + } else + arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; return perf_pmu_register(&arc_pmu->pmu, pdev->name, PERF_TYPE_RAW); } @@ -341,6 +551,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) #ifdef CONFIG_OF static const struct of_device_id arc_pmu_match[] = { { .compatible = "snps,arc700-pct" }, + { .compatible = "snps,archs-pct" }, {}, }; MODULE_DEVICE_TABLE(of, arc_pmu_match); @@ -348,7 +559,7 @@ MODULE_DEVICE_TABLE(of, arc_pmu_match); static struct platform_driver arc_pmu_driver = { .driver = { - .name = "arc700-pct", + .name = "arc-pct", .of_match_table = of_match_ptr(arc_pmu_match), }, .probe = arc_pmu_device_probe, diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 44092456776f..91d5a0f1f3f7 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -65,7 +65,7 @@ asmlinkage void ret_from_fork(void); * ------------------ * | r25 | <==== top of Stack (thread.ksp) * ~ ~ - * | --to-- | (CALLEE Regs of user mode) + * | --to-- | (CALLEE Regs of kernel mode) * | r13 | * ------------------ * | fp | diff --git a/arch/arc/kernel/unaligned.c b/arch/arc/kernel/unaligned.c index 74db59b6f392..abd961f3e763 100644 --- a/arch/arc/kernel/unaligned.c +++ b/arch/arc/kernel/unaligned.c @@ -34,7 +34,7 @@ " .section .fixup,\"ax\"\n" \ " .align 4\n" \ "3: mov %0, 1\n" \ - " b 2b\n" \ + " j 2b\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ " .align 4\n" \ @@ -82,7 +82,7 @@ " .section .fixup,\"ax\"\n" \ " .align 4\n" \ "4: mov %0, 1\n" \ - " b 3b\n" \ + " j 3b\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ " .align 4\n" \ @@ -113,7 +113,7 @@ " .section .fixup,\"ax\"\n" \ " .align 4\n" \ "6: mov %0, 1\n" \ - " b 5b\n" \ + " j 5b\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ " .align 4\n" \ diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 1cd6695b6ab5..0d1a6e96839f 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -22,15 +22,22 @@ #include static int l2_line_sz; +int ioc_exists; +volatile int slc_enable = 1, ioc_enable = 1; void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr, unsigned long sz, const int cacheop); +void (*__dma_cache_wback_inv)(unsigned long start, unsigned long sz); +void (*__dma_cache_inv)(unsigned long start, unsigned long sz); +void (*__dma_cache_wback)(unsigned long start, unsigned long sz); + char *arc_cache_mumbojumbo(int c, char *buf, int len) { int n = 0; struct cpuinfo_arc_cache *p; +#define IS_USED_RUN(v) ((v) ?
"" : "(disabled) ") #define PR_CACHE(p, cfg, str) \ if (!(p)->ver) \ n += scnprintf(buf + n, len - n, str"\t\t: N/A\n"); \ @@ -45,10 +52,18 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) PR_CACHE(&cpuinfo_arc700[c].icache, CONFIG_ARC_HAS_ICACHE, "I-Cache"); PR_CACHE(&cpuinfo_arc700[c].dcache, CONFIG_ARC_HAS_DCACHE, "D-Cache"); + if (!is_isa_arcv2()) + return buf; + p = &cpuinfo_arc700[c].slc; if (p->ver) n += scnprintf(buf + n, len - n, - "SLC\t\t: %uK, %uB Line\n", p->sz_k, p->line_len); + "SLC\t\t: %uK, %uB Line%s\n", + p->sz_k, p->line_len, IS_USED_RUN(slc_enable)); + + if (ioc_exists) + n += scnprintf(buf + n, len - n, "IOC\t\t:%s\n", + IS_USED_RUN(ioc_enable)); return buf; } @@ -58,18 +73,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) * the cpuinfo structure for later use. * No Validation done here, simply read/convert the BCRs */ -void read_decode_cache_bcr(void) +static void read_decode_cache_bcr_arcv2(int cpu) { - struct cpuinfo_arc_cache *p_ic, *p_dc, *p_slc; - unsigned int cpu = smp_processor_id(); - struct bcr_cache { -#ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int pad:12, line_len:4, sz:4, config:4, ver:8; -#else - unsigned int ver:8, config:4, sz:4, line_len:4, pad:12; -#endif - } ibcr, dbcr; - + struct cpuinfo_arc_cache *p_slc = &cpuinfo_arc700[cpu].slc; struct bcr_generic sbcr; struct bcr_slc_cfg { @@ -80,6 +86,39 @@ void read_decode_cache_bcr(void) #endif } slc_cfg; + struct bcr_clust_cfg { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad:7, c:1, num_entries:8, num_cores:8, ver:8; +#else + unsigned int ver:8, num_cores:8, num_entries:8, c:1, pad:7; +#endif + } cbcr; + + READ_BCR(ARC_REG_SLC_BCR, sbcr); + if (sbcr.ver) { + READ_BCR(ARC_REG_SLC_CFG, slc_cfg); + p_slc->ver = sbcr.ver; + p_slc->sz_k = 128 << slc_cfg.sz; + l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64; + } + + READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); + if (cbcr.c && ioc_enable) + ioc_exists = 1; +} + +void read_decode_cache_bcr(void) +{ + struct cpuinfo_arc_cache *p_ic, *p_dc; + unsigned int cpu = smp_processor_id(); + struct bcr_cache { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad:12, line_len:4, sz:4, config:4, ver:8; +#else + unsigned int ver:8, config:4, sz:4, line_len:4, pad:12; +#endif + } ibcr, dbcr; + p_ic = &cpuinfo_arc700[cpu].icache; READ_BCR(ARC_REG_IC_BCR, ibcr); @@ -122,17 +161,8 @@ dc_chk: p_dc->ver = dbcr.ver; slc_chk: - if (!is_isa_arcv2()) - return; - - p_slc = &cpuinfo_arc700[cpu].slc; - READ_BCR(ARC_REG_SLC_BCR, sbcr); - if (sbcr.ver) { - READ_BCR(ARC_REG_SLC_CFG, slc_cfg); - p_slc->ver = sbcr.ver; - p_slc->sz_k = 128 << slc_cfg.sz; - l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 
128 : 64; - } + if (is_isa_arcv2()) + read_decode_cache_bcr_arcv2(cpu); } /* @@ -516,11 +546,6 @@ noinline void slc_op(unsigned long paddr, unsigned long sz, const int op) #endif } -static inline int need_slc_flush(void) -{ - return is_isa_arcv2() && l2_line_sz; -} - /*********************************************************** * Exported APIs */ @@ -569,30 +594,74 @@ void flush_dcache_page(struct page *page) } EXPORT_SYMBOL(flush_dcache_page); -void dma_cache_wback_inv(unsigned long start, unsigned long sz) +/* + * DMA ops for systems with L1 cache only + * Make memory coherent with L1 cache by flushing/invalidating L1 lines + */ +static void __dma_cache_wback_inv_l1(unsigned long start, unsigned long sz) { __dc_line_op_k(start, sz, OP_FLUSH_N_INV); +} - if (need_slc_flush()) - slc_op(start, sz, OP_FLUSH_N_INV); +static void __dma_cache_inv_l1(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_INV); +} + +static void __dma_cache_wback_l1(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_FLUSH); +} + +/* + * DMA ops for systems with both L1 and L2 caches, but without IOC + * Both L1 and L2 lines need to be explicitly flushed/invalidated + */ +static void __dma_cache_wback_inv_slc(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_FLUSH_N_INV); + slc_op(start, sz, OP_FLUSH_N_INV); +} + +static void __dma_cache_inv_slc(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_INV); + slc_op(start, sz, OP_INV); +} + +static void __dma_cache_wback_slc(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_FLUSH); + slc_op(start, sz, OP_FLUSH); +} + +/* + * DMA ops for systems with IOC + * IOC hardware snoops all DMA traffic keeping the caches consistent with + * memory - eliding the need for any explicit cache maintenance of DMA buffers + */ +static void __dma_cache_wback_inv_ioc(unsigned long start, unsigned long sz) {} +static void __dma_cache_inv_ioc(unsigned long start, unsigned long sz) {} +static void __dma_cache_wback_ioc(unsigned long start, unsigned long sz) {} + +/* + * Exported DMA API + */ +void dma_cache_wback_inv(unsigned long start, unsigned long sz) +{ + __dma_cache_wback_inv(start, sz); } EXPORT_SYMBOL(dma_cache_wback_inv); void dma_cache_inv(unsigned long start, unsigned long sz) { - __dc_line_op_k(start, sz, OP_INV); - - if (need_slc_flush()) - slc_op(start, sz, OP_INV); + __dma_cache_inv(start, sz); } EXPORT_SYMBOL(dma_cache_inv); void dma_cache_wback(unsigned long start, unsigned long sz) { - __dc_line_op_k(start, sz, OP_FLUSH); - - if (need_slc_flush()) - slc_op(start, sz, OP_FLUSH); + __dma_cache_wback(start, sz); } EXPORT_SYMBOL(dma_cache_wback); @@ -848,4 +917,41 @@ void arc_cache_init(void) panic("Disable CONFIG_ARC_CACHE_VIPT_ALIASING\n"); } } + + if (is_isa_arcv2() && l2_line_sz && !slc_enable) { + + /* IM set : flush before invalidate */ + write_aux_reg(ARC_REG_SLC_CTRL, + read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_IM); + + write_aux_reg(ARC_REG_SLC_INVALIDATE, 1); + + /* Important to wait for flush to complete */ + while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY); + write_aux_reg(ARC_REG_SLC_CTRL, + read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_DISABLE); + } + + if (is_isa_arcv2() && ioc_exists) { + /* IO coherency base - 0x8z */ + write_aux_reg(ARC_REG_IO_COH_AP0_BASE, 0x80000); + /* IO coherency aperture size - 512Mb: 0x8z-0xAz */ + write_aux_reg(ARC_REG_IO_COH_AP0_SIZE, 0x11); + /* Enable partial writes */ + write_aux_reg(ARC_REG_IO_COH_PARTIAL, 1); + /* Enable IO
coherency */ + write_aux_reg(ARC_REG_IO_COH_ENABLE, 1); + + __dma_cache_wback_inv = __dma_cache_wback_inv_ioc; + __dma_cache_inv = __dma_cache_inv_ioc; + __dma_cache_wback = __dma_cache_wback_ioc; + } else if (is_isa_arcv2() && l2_line_sz && slc_enable) { + __dma_cache_wback_inv = __dma_cache_wback_inv_slc; + __dma_cache_inv = __dma_cache_inv_slc; + __dma_cache_wback = __dma_cache_wback_slc; + } else { + __dma_cache_wback_inv = __dma_cache_wback_inv_l1; + __dma_cache_inv = __dma_cache_inv_l1; + __dma_cache_wback = __dma_cache_wback_l1; + } } diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 57706a9c6948..29a46bb198cc 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -19,6 +19,7 @@ #include <linux/dma-mapping.h> #include <linux/dma-debug.h> #include <linux/export.h> +#include <asm/cache.h> #include <asm/cacheflush.h> /* @@ -53,6 +54,20 @@ void *dma_alloc_coherent(struct device *dev, size_t size, { void *paddr, *kvaddr; + /* + * IOC relies on all data (even coherent DMA data) being in cache + * Thus allocate normal cached memory + * + * The gains with IOC are two-pronged: + * -For streaming data, elides the need for cache maintenance, saving + * cycles in flush code, and bus bandwidth as all the lines of a + * buffer need to be flushed out to memory + * -For coherent data, Read/Write to buffers terminate early in cache + * (vs. always going to memory - thus are faster) + */ + if (is_isa_arcv2() && ioc_exists) + return dma_alloc_noncoherent(dev, size, dma_handle, gfp); + /* This is linear addr (0x8000_0000 based) */ paddr = alloc_pages_exact(size, gfp); if (!paddr) @@ -85,6 +100,9 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *kvaddr, dma_addr_t dma_handle) { + if (is_isa_arcv2() && ioc_exists) + return dma_free_noncoherent(dev, size, kvaddr, dma_handle); + iounmap((void __force __iomem *)kvaddr); free_pages_exact((void *)dma_handle, size); diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c index e7769c3ab5f2..ad9825d4026a 100644 --- a/arch/arc/plat-axs10x/axs10x.c +++ b/arch/arc/plat-axs10x/axs10x.c @@ -46,7 +46,7 @@ static void __init axs10x_enable_gpio_intc_wire(void) * ------------------- ------------------- * | snps,dw-apb-gpio | | snps,dw-apb-gpio | * ------------------- ------------------- - * | | + * | #12 | * | [ Debug UART on cpu card ] * | * ------------------------
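A closing note on the IOC aperture programming in arc_cache_init() above: the two magic numbers are mutually consistent with a 4 KB granule - the base register holds the physical address in 4 KB units, and the size register holds log2 of the aperture size in 4 KB units. This encoding is inferred purely from the constants in the patch, not from hardware documentation, and the helpers below are hypothetical:

```c
#include <assert.h>
#include <stdint.h>

/* Base in 4 KB units (inferred: 0x80000000 >> 12 == 0x80000) */
static uint32_t ioc_ap_base(uint32_t paddr)
{
	return paddr >> 12;
}

/* Size as log2(bytes) - log2(4K) (inferred: 512 MB -> 29 - 12 == 0x11) */
static uint32_t ioc_ap_size(uint64_t bytes)
{
	uint32_t log2sz = 0;

	while ((1ULL << log2sz) < bytes)
		log2sz++;
	return log2sz - 12;
}

int main(void)
{
	assert(ioc_ap_base(0x80000000u) == 0x80000);	/* matches the patch */
	assert(ioc_ap_size(512ULL << 20) == 0x11);	/* matches the patch */
	return 0;
}
```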