From 27f6c573e0f77f7d1cc907c1494c99a61e48b7d8 Mon Sep 17 00:00:00 2001
From: "Chen, Gong" <gong.chen@linux.intel.com>
Date: Thu, 27 Mar 2014 21:24:36 -0400
Subject: [PATCH 1/7] x86, CMCI: Add proper detection of end of CMCI storms

When CMCI storm persists for a long time(at least beyond predefined
threshold. It's 30 seconds for now), we can watch CMCI storm is
detected immediately after it subsides.

...
Dec 10 22:04:29 kernel: CMCI storm detected: switching to poll mode
Dec 10 22:04:59 kernel: CMCI storm subsided: switching to interrupt mode
Dec 10 22:04:59 kernel: CMCI storm detected: switching to poll mode
Dec 10 22:05:29 kernel: CMCI storm subsided: switching to interrupt mode
...

The problem is that our logic that determines that the storm has
ended is incorrect. We announce the end, re-enable interrupts and
realize that the storm is still going on, so we switch back to
polling mode. Rinse, repeat.

When a storm happens we disable signaling of errors via CMCI and begin
polling machine check banks instead. If we find any logged errors,
then we need to set a per-cpu flag so that our per-cpu tests that
check whether the storm is ongoing will see that errors are still
being logged independently of whether mce_notify_irq() says that the
error has been fully processed.

cmci_clear() is not the right tool to disable a bank. It disables the
interrupt for the bank as desired, but it also clears the bit for
this bank in "mce_banks_owned" so we will skip the bank when polling
(so we fail to see that the storm continues because we stop looking).
New cmci_storm_disable_banks() just disables the interrupt while
allowing polling to continue.

Reported-by: William Dauchy <wdauchy@gmail.com>
Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c       | 18 +++++++++++++++++-
 arch/x86/kernel/cpu/mcheck/mce_intel.c | 19 ++++++++++++++++++-
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 4d5419b249da..78c92125db8a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -89,6 +89,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int			cpu_missing;
 
+/* CMCI storm detection filter */
+static DEFINE_PER_CPU(unsigned long, mce_polled_error);
+
 /*
  * MCA banks polled by the period polling timer for corrected events.
  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -595,6 +598,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
 	struct mce m;
 	int i;
+	unsigned long *v;
 
 	this_cpu_inc(mce_poll_count);
 
@@ -614,6 +618,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		if (!(m.status & MCI_STATUS_VAL))
 			continue;
 
+		v = &get_cpu_var(mce_polled_error);
+		set_bit(0, v);
 		/*
 		 * Uncorrected or signalled events are handled by the exception
 		 * handler when it is enabled, so don't process those here.
@@ -1278,10 +1284,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
 static unsigned long (*mce_adjust_timer)(unsigned long interval) =
 	mce_adjust_timer_default;
 
+static int cmc_error_seen(void)
+{
+	unsigned long *v = &__get_cpu_var(mce_polled_error);
+
+	return test_and_clear_bit(0, v);
+}
+
 static void mce_timer_fn(unsigned long data)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
 	unsigned long iv;
+	int notify;
 
 	WARN_ON(smp_processor_id() != data);
 
@@ -1296,7 +1310,9 @@ static void mce_timer_fn(unsigned long data)
 	 * polling interval, otherwise increase the polling interval.
 	 */
 	iv = __this_cpu_read(mce_next_interval);
-	if (mce_notify_irq()) {
+	notify = mce_notify_irq();
+	notify |= cmc_error_seen();
+	if (notify) {
 		iv = max(iv / 2, (unsigned long) HZ/100);
 	} else {
 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index fb6156fee6f7..3bdb95ae8c43 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -9,6 +9,7 @@
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <linux/sched.h>
+#include <linux/cpumask.h>
 #include <asm/apic.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
@@ -137,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
 	}
 }
 
+static void cmci_storm_disable_banks(void)
+{
+	unsigned long flags, *owned;
+	int bank;
+	u64 val;
+
+	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+	owned = __get_cpu_var(mce_banks_owned);
+	for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+		rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+		val &= ~MCI_CTL2_CMCI_EN;
+		wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+	}
+	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
 static bool cmci_storm_detect(void)
 {
 	unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -158,7 +175,7 @@ static bool cmci_storm_detect(void)
 	if (cnt <= CMCI_STORM_THRESHOLD)
 		return false;
 
-	cmci_clear();
+	cmci_storm_disable_banks();
 	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
 	r = atomic_add_return(1, &cmci_storm_on_cpus);
 	mce_timer_kick(CMCI_POLL_INTERVAL);

From 023de4a09f571fad0af9691e4e437e14b68f05fb Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 1 Apr 2014 13:30:21 +0100
Subject: [PATCH 2/7] x86/apic: Reinstate error IRQ Pentium erratum 3AP
 workaround

A change introduced with commit 60283df7ac26a4fe2d56631ca2946e04725e7eaf
("x86/apic: Read Error Status Register correctly") removed a read from the
APIC ESR register made before writing to same required to retrieve the
correct error status on Pentium systems affected by the 3AP erratum[1]:

	"3AP. Writes to Error Register Clears Register

	PROBLEM: The APIC Error register is intended to only be read.
	If there is a write to this register the data in the APIC Error
	register will be cleared and lost.

	IMPLICATION: There is a possibility of clearing the Error
	register status since the write to the register is not
	specifically blocked.

	WORKAROUND: Writes should not occur to the Pentium processor
	APIC Error register.

	STATUS: For the steppings affected see the Summary Table of
	Changes at the beginning of this section."

The steppings affected are actually: B1, B3 and B5.

To avoid this information loss this change avoids the write to
ESR on all Pentium systems where it is actually never needed;
in Pentium processor documentation ESR was noted read-only and
the write only required for future architectural
compatibility[2].

The approach taken is the same as in lapic_setup_esr().

References:

	[1] "Pentium Processor Family Developer's Manual", Intel Corporation,
	    1997, order number 241428-005, Appendix A "Errata and S-Specs for the
	    Pentium Processor Family", p. A-92,

	[2] "Pentium Processor Family Developer's Manual, Volume 3: Architecture
	    and Programming Manual", Intel Corporation, 1995, order number
	    241430-004, Section 19.3.3. "Error Handling In APIC", p. 19-33.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Link: http://lkml.kernel.org/r/alpine.LFD.2.11.1404011300010.27402@eddie.linux-mips.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/apic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 53e20531470e..005ed3fb0391 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1996,7 +1996,8 @@ static inline void __smp_error_interrupt(struct pt_regs *regs)
 	};
 
 	/* First tickle the hardware, only then report what went on. -- REW */
-	apic_write(APIC_ESR, 0);
+	if (lapic_get_maxlvt() > 3)	/* Due to the Pentium erratum 3AP. */
+		apic_write(APIC_ESR, 0);
 	v = apic_read(APIC_ESR);
 	ack_APIC_irq();
 	atomic_inc(&irq_err_count);

From f704a7d7f1d815621cb4c47f7a94787e1bd7c27c Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Tue, 1 Apr 2014 23:51:42 -0700
Subject: [PATCH 3/7] x86/platform/hyperv: Handle VMBUS driver being a module

Hyper-V VMBUS driver can be a module; handle this case
correctly. Please apply.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Cc: olaf@aepfle.de
Cc: apw@canonical.com
Cc: jasowang@redhat.com
Link: http://lkml.kernel.org/r/1396421502-23222-1-git-send-email-kys@microsoft.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 42805fac0092..283a76a9cc40 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -125,7 +125,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
 	seq_printf(p, "  Machine check polls\n");
 #endif
-#if defined(CONFIG_HYPERV) || defined(CONFIG_XEN)
+#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
 	seq_printf(p, "%*s: ", prec, "THR");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);

From 396f1a08db212138418b38f784e4bbe516d2fdb2 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Thu, 10 Apr 2014 13:30:13 +0100
Subject: [PATCH 4/7] x86/efi: Fix boot failure with EFI stub

commit 54b52d872680 ("x86/efi: Build our own EFI services pointer
table") introduced a regression because the 64-bit file_size()
implementation passed a pointer to a 32-bit data object, instead of a
pointer to a 64-bit object.

Because the firmware treats the object as 64-bits regardless it was
reading random values from the stack for the upper 32-bits.

This resulted in people being unable to boot their machines, after
seeing the following error messages,

    Failed to get file info size
    Failed to alloc highmem for files

Reported-by: Dzmitry Sledneu <dzmitry.sledneu@gmail.com>
Reported-by: Koen Kooi <koen@dominion.thruhere.net>
Tested-by: Koen Kooi <koen@dominion.thruhere.net>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/boot/compressed/eboot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 1e6146137f8e..280165524ee4 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -112,7 +112,7 @@ __file_size64(void *__fh, efi_char16_t *filename_16,
 	efi_file_info_t *info;
 	efi_status_t status;
 	efi_guid_t info_guid = EFI_FILE_INFO_ID;
-	u32 info_sz;
+	u64 info_sz;
 
 	status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16,
 				 EFI_FILE_MODE_READ, (u64)0);

From 7e8213c1f3acc064aef37813a39f13cbfe7c3ce7 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@console-pimps.org>
Date: Tue, 8 Apr 2014 13:14:00 +0100
Subject: [PATCH 5/7] x86/efi: Correct EFI boot stub use of code32_start
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

code32_start should point at the start of the protected mode code, and
*not* at the beginning of the bzImage. This is much easier to do in
assembly so document that callers of make_boot_params() need to fill out
code32_start.

The fallout from this bug is that we would end up relocating the image
but copying the image at some offset, resulting in what appeared to be
memory corruption.

Reported-by: Thomas Bächler <thomas@archlinux.org>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/boot/compressed/eboot.c   | 5 +++--
 arch/x86/boot/compressed/head_32.S | 8 ++------
 arch/x86/boot/compressed/head_64.S | 9 +++------
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 280165524ee4..91d17007323b 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1016,6 +1016,9 @@ void setup_graphics(struct boot_params *boot_params)
  * Because the x86 boot code expects to be passed a boot_params we
  * need to create one ourselves (usually the bootloader would create
  * one for us).
+ *
+ * The caller is responsible for filling out ->code32_start in the
+ * returned boot_params.
  */
 struct boot_params *make_boot_params(struct efi_config *c)
 {
@@ -1081,8 +1084,6 @@ struct boot_params *make_boot_params(struct efi_config *c)
 	hdr->vid_mode = 0xffff;
 	hdr->boot_flag = 0xAA55;
 
-	hdr->code32_start = (__u64)(unsigned long)image->image_base;
-
 	hdr->type_of_loader = 0x21;
 
 	/* Convert unicode cmdline to ascii */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index de9d4200d305..cbed1407a5cd 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -59,6 +59,7 @@ ENTRY(efi_pe_entry)
 	call	make_boot_params
 	cmpl	$0, %eax
 	je	fail
+	movl	%esi, BP_code32_start(%eax)
 	popl	%ecx
 	pushl	%eax
 	pushl	%ecx
@@ -90,12 +91,7 @@ fail:
 	hlt
 	jmp	fail
 2:
-	call	3f
-3:
-	popl	%eax
-	subl	$3b, %eax
-	subl	BP_pref_address(%esi), %eax
-	add	BP_code32_start(%esi), %eax
+	movl	BP_code32_start(%esi), %eax
 	leal	preferred_addr(%eax), %eax
 	jmp	*%eax
 
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 57e58a5fa210..0d558ee899ae 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -261,6 +261,8 @@ ENTRY(efi_pe_entry)
 	cmpq	$0,%rax
 	je	fail
 	mov	%rax, %rsi
+	leaq	startup_32(%rip), %rax
+	movl	%eax, BP_code32_start(%rsi)
 	jmp	2f		/* Skip the relocation */
 
 handover_entry:
@@ -284,12 +286,7 @@ fail:
 	hlt
 	jmp	fail
 2:
-	call	3f
-3:
-	popq	%rax
-	subq	$3b, %rax
-	subq	BP_pref_address(%rsi), %rax
-	add	BP_code32_start(%esi), %eax
+	movl	BP_code32_start(%esi), %eax
 	leaq	preferred_addr(%rax), %rax
 	jmp	*%rax
 

From 47514c996fac5e6f13ef3a4c5e23f1c5cffabb7b Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Thu, 10 Apr 2014 14:11:45 +0100
Subject: [PATCH 6/7] efi: Pass correct file handle to efi_file_{read,close}

We're currently passing the file handle for the root file system to
efi_file_read() and efi_file_close(), instead of the file handle for the
file we wish to read/close.

While this has worked up until now, it seems that it has only been by
pure luck. Olivier explains,

 "The issue is the UEFI Fat driver might return the same function for
  'fh->read()' and 'h->read()'. While in our case it does not work with
  a different implementation of EFI_SIMPLE_FILE_SYSTEM_PROTOCOL. In our
  case, we return a different pointer when reading a directory and
  reading a file."

Fixing this actually clears up the two functions because we can drop one
of the arguments, and instead only pass a file 'handle' argument.

Reported-by: Olivier Martin <olivier.martin@arm.com>
Reviewed-by: Olivier Martin <olivier.martin@arm.com>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Cc: Leif Lindholm <leif.lindholm@linaro.org>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/boot/compressed/eboot.c       | 12 ++++++------
 drivers/firmware/efi/efi-stub-helper.c |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 91d17007323b..4703a6c4b8e3 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -167,31 +167,31 @@ efi_file_size(efi_system_table_t *sys_table, void *__fh,
 }
 
 static inline efi_status_t
-efi_file_read(void *__fh, void *handle, unsigned long *size, void *addr)
+efi_file_read(void *handle, unsigned long *size, void *addr)
 {
 	unsigned long func;
 
 	if (efi_early->is64) {
-		efi_file_handle_64_t *fh = __fh;
+		efi_file_handle_64_t *fh = handle;
 
 		func = (unsigned long)fh->read;
 		return efi_early->call(func, handle, size, addr);
 	} else {
-		efi_file_handle_32_t *fh = __fh;
+		efi_file_handle_32_t *fh = handle;
 
 		func = (unsigned long)fh->read;
 		return efi_early->call(func, handle, size, addr);
 	}
 }
 
-static inline efi_status_t efi_file_close(void *__fh, void *handle)
+static inline efi_status_t efi_file_close(void *handle)
 {
 	if (efi_early->is64) {
-		efi_file_handle_64_t *fh = __fh;
+		efi_file_handle_64_t *fh = handle;
 
 		return efi_early->call((unsigned long)fh->close, handle);
 	} else {
-		efi_file_handle_32_t *fh = __fh;
+		efi_file_handle_32_t *fh = handle;
 
 		return efi_early->call((unsigned long)fh->close, handle);
 	}
diff --git a/drivers/firmware/efi/efi-stub-helper.c b/drivers/firmware/efi/efi-stub-helper.c
index ff50aeebf0d9..2c41eaece2c1 100644
--- a/drivers/firmware/efi/efi-stub-helper.c
+++ b/drivers/firmware/efi/efi-stub-helper.c
@@ -397,7 +397,7 @@ static efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg,
 				else
 					chunksize = size;
 
-				status = efi_file_read(fh, files[j].handle,
+				status = efi_file_read(files[j].handle,
 						       &chunksize,
 						       (void *)addr);
 				if (status != EFI_SUCCESS) {
@@ -408,7 +408,7 @@ static efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg,
 				size -= chunksize;
 			}
 
-			efi_file_close(fh, files[j].handle);
+			efi_file_close(files[j].handle);
 		}
 
 	}
@@ -425,7 +425,7 @@ free_file_total:
 
 close_handles:
 	for (k = j; k < i; k++)
-		efi_file_close(fh, files[k].handle);
+		efi_file_close(files[k].handle);
 free_files:
 	efi_call_early(free_pool, files);
 fail:

From b3b42ac2cbae1f3cecbb6229964a4d48af31d382 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Sun, 16 Mar 2014 15:31:54 -0700
Subject: [PATCH 7/7] x86-64, modify_ldt: Ban 16-bit segments on 64-bit kernels

The IRET instruction, when returning to a 16-bit segment, only
restores the bottom 16 bits of the user space stack pointer.  We have
a software workaround for that ("espfix") for the 32-bit kernel, but
it relies on a nonzero stack segment base which is not available in
32-bit mode.

Since 16-bit support is somewhat crippled anyway on a 64-bit kernel
(no V86 mode), and most (if not quite all) 64-bit processors support
virtualization for the users who really need it, simply reject
attempts at creating a 16-bit segment when running on top of a 64-bit
kernel.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-kicdm89kzw9lldryb1br9od0@git.kernel.org
Cc: <stable@vger.kernel.org>
---
 arch/x86/kernel/ldt.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ebc987398923..af1d14a9ebda 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -229,6 +229,17 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 		}
 	}
 
+	/*
+	 * On x86-64 we do not support 16-bit segments due to
+	 * IRET leaking the high bits of the kernel stack address.
+	 */
+#ifdef CONFIG_X86_64
+	if (!ldt_info.seg_32bit) {
+		error = -EINVAL;
+		goto out_unlock;
+	}
+#endif
+
 	fill_ldt(&ldt, &ldt_info);
 	if (oldmode)
 		ldt.avl = 0;