diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h
index f69f69c8120c..da84645525b9 100644
--- a/arch/arm64/include/asm/kgdb.h
+++ b/arch/arm64/include/asm/kgdb.h
@@ -38,25 +38,54 @@ extern int kgdb_fault_expected;
 #endif /* !__ASSEMBLY__ */
 
 /*
- * gdb is expecting the following registers layout.
+ * gdb remote procotol (well most versions of it) expects the following
+ * register layout.
  *
  * General purpose regs:
  *     r0-r30: 64 bit
  *     sp,pc : 64 bit
- *     pstate  : 64 bit
- *     Total: 34
+ *     pstate  : 32 bit
+ *     Total: 33 + 1
  * FPU regs:
  *     f0-f31: 128 bit
- *     Total: 32
- * Extra regs
  *     fpsr & fpcr: 32 bit
- *     Total: 2
+ *     Total: 32 + 2
  *
+ * To expand a little on the "most versions of it"... when the gdb remote
+ * protocol for AArch64 was developed it depended on a statement in the
+ * Architecture Reference Manual that claimed "SPSR_ELx is a 32-bit register".
+ * and, as a result, allocated only 32-bits for the PSTATE in the remote
+ * protocol. In fact this statement is still present in ARM DDI 0487A.i.
+ *
+ * Unfortunately "is a 32-bit register" has a very special meaning for
+ * system registers. It means that "the upper bits, bits[63:32], are
+ * RES0.". RES0 is heavily used in the ARM architecture documents as a
+ * way to leave space for future architecture changes. So to translate a
+ * little for people who don't spend their spare time reading ARM architecture
+ * manuals, what "is a 32-bit register" actually means in this context is
+ * "is a 64-bit register but one with no meaning allocated to any of the
+ * upper 32-bits... *yet*".
+ *
+ * Perhaps then we should not be surprised that this has led to some
+ * confusion. Specifically a patch, influenced by the above translation,
+ * that extended PSTATE to 64-bit was accepted into gdb-7.7 but the patch
+ * was reverted in gdb-7.8.1 and all later releases, when this was
+ * discovered to be an undocumented protocol change.
+ *
+ * So... it is *not* wrong for us to only allocate 32-bits to PSTATE
+ * here even though the kernel itself allocates 64-bits for the same
+ * state. That is because this bit of code tells the kernel how the gdb
+ * remote protocol (well most versions of it) describes the register state.
+ *
+ * Note that if you are using one of the versions of gdb that supports
+ * the gdb-7.7 version of the protocol you cannot use kgdb directly
+ * without providing a custom register description (gdb can load new
+ * protocol descriptions at runtime).
  */
 
-#define _GP_REGS		34
+#define _GP_REGS		33
 #define _FP_REGS		32
-#define _EXTRA_REGS		2
+#define _EXTRA_REGS		3
 /*
  * general purpose registers size in bytes.
  * pstate is only 4 bytes. subtract 4 bytes
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index fc9682bfe002..e875a5a551d7 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -30,22 +30,53 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
 	unsigned int tmp;
 	arch_spinlock_t lockval;
+	u32 owner;
+
+	/*
+	 * Ensure prior spin_lock operations to other locks have completed
+	 * on this CPU before we test whether "lock" is locked.
+	 */
+	smp_mb();
+	owner = READ_ONCE(lock->owner) << 16;
 
 	asm volatile(
 "	sevl\n"
 "1:	wfe\n"
 "2:	ldaxr	%w0, %2\n"
+	/* Is the lock free? */
 "	eor	%w1, %w0, %w0, ror #16\n"
-"	cbnz	%w1, 1b\n"
+"	cbz	%w1, 3f\n"
+	/* Lock taken -- has there been a subsequent unlock->lock transition? */
+"	eor	%w1, %w3, %w0, lsl #16\n"
+"	cbz	%w1, 1b\n"
+	/*
+	 * The owner has been updated, so there was an unlock->lock
+	 * transition that we missed. That means we can rely on the
+	 * store-release of the unlock operation paired with the
+	 * load-acquire of the lock operation to publish any of our
+	 * previous stores to the new lock owner and therefore don't
+	 * need to bother with the writeback below.
+	 */
+"	b	4f\n"
+"3:\n"
+	/*
+	 * Serialise against any concurrent lockers by writing back the
+	 * unlocked lock value
+	 */
 	ARM64_LSE_ATOMIC_INSN(
 	/* LL/SC */
 "	stxr	%w1, %w0, %2\n"
-"	cbnz	%w1, 2b\n", /* Serialise against any concurrent lockers */
-	/* LSE atomics */
 "	nop\n"
-"	nop\n")
+"	nop\n",
+	/* LSE atomics */
+"	mov	%w1, %w0\n"
+"	cas	%w0, %w0, %2\n"
+"	eor	%w1, %w1, %w0\n")
+	/* Somebody else wrote to the lock, GOTO 10 and reload the value */
+"	cbnz	%w1, 2b\n"
+"4:"
 	: "=&r" (lockval), "=&r" (tmp), "+Q" (*lock)
-	:
+	: "r" (owner)
 	: "memory");
 }
 
@@ -148,6 +179,7 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
+	smp_mb(); /* See arch_spin_unlock_wait */
 	return !arch_spin_value_unlocked(READ_ONCE(*lock));
 }
 
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index b67531a13136..b5f063e5eff7 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -58,7 +58,17 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
 	{ "x30", 8, offsetof(struct pt_regs, regs[30])},
 	{ "sp", 8, offsetof(struct pt_regs, sp)},
 	{ "pc", 8, offsetof(struct pt_regs, pc)},
-	{ "pstate", 8, offsetof(struct pt_regs, pstate)},
+	/*
+	 * struct pt_regs thinks PSTATE is 64-bits wide but gdb remote
+	 * protocol disagrees. Therefore we must extract only the lower
+	 * 32-bits. Look for the big comment in asm/kgdb.h for more
+	 * detail.
+	 */
+	{ "pstate", 4, offsetof(struct pt_regs, pstate)
+#ifdef CONFIG_CPU_BIG_ENDIAN
+							+ 4
+#endif
+	},
 	{ "v0", 16, -1 },
 	{ "v1", 16, -1 },
 	{ "v2", 16, -1 },
@@ -128,6 +138,8 @@ sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task)
 	memset((char *)gdb_regs, 0, NUMREGBYTES);
 	thread_regs = task_pt_regs(task);
 	memcpy((void *)gdb_regs, (void *)thread_regs->regs, GP_REG_BYTES);
+	/* Special case for PSTATE (check comments in asm/kgdb.h for details) */
+	dbg_get_reg(33, gdb_regs + GP_REG_BYTES, thread_regs);
 }
 
 void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index f7cf463107df..2a43012616b7 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -64,8 +64,7 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom,
 
 	/*
 	 * We need to switch to kernel mode so that we can use __get_user
-	 * to safely read from kernel space.  Note that we now dump the
-	 * code first, just in case the backtrace kills us.
+	 * to safely read from kernel space.
 	 */
 	fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -111,21 +110,12 @@ static void dump_backtrace_entry(unsigned long where)
 	print_ip_sym(where);
 }
 
-static void dump_instr(const char *lvl, struct pt_regs *regs)
+static void __dump_instr(const char *lvl, struct pt_regs *regs)
 {
 	unsigned long addr = instruction_pointer(regs);
-	mm_segment_t fs;
 	char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str;
 	int i;
 
-	/*
-	 * We need to switch to kernel mode so that we can use __get_user
-	 * to safely read from kernel space.  Note that we now dump the
-	 * code first, just in case the backtrace kills us.
-	 */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-
 	for (i = -4; i < 1; i++) {
 		unsigned int val, bad;
 
@@ -139,8 +129,18 @@ static void dump_instr(const char *lvl, struct pt_regs *regs)
 		}
 	}
 	printk("%sCode: %s\n", lvl, str);
+}
 
-	set_fs(fs);
+static void dump_instr(const char *lvl, struct pt_regs *regs)
+{
+	if (!user_mode(regs)) {
+		mm_segment_t fs = get_fs();
+		set_fs(KERNEL_DS);
+		__dump_instr(lvl, regs);
+		set_fs(fs);
+	} else {
+		__dump_instr(lvl, regs);
+	}
 }
 
 static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ba3fc12bd272..013e2cbe7924 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -441,7 +441,7 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 	return 1;
 }
 
-static struct fault_info {
+static const struct fault_info {
 	int	(*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs);
 	int	sig;
 	int	code;
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 1b8304e1efaa..140436a046c0 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -1010,8 +1010,8 @@ int arm_pmu_device_probe(struct platform_device *pdev,
 		if (!ret)
 			ret = init_fn(pmu);
 	} else {
-		ret = probe_current_pmu(pmu, probe_table);
 		cpumask_setall(&pmu->supported_cpus);
+		ret = probe_current_pmu(pmu, probe_table);
 	}
 
 	if (ret) {