From 542134c0375b5ca2b1d18490c02b8a20bfdd8d74 Mon Sep 17 00:00:00 2001
From: Eudean Sun <eudean@arista.com>
Date: Tue, 21 Nov 2017 10:43:24 -0800
Subject: [PATCH 001/305] HID: cp2112: Fix I2C_BLOCK_DATA transactions

The existing driver erroneously treats I2C_BLOCK_DATA and BLOCK_DATA
commands the same.

For I2C_BLOCK_DATA reads, the length of the read is provided in
data->block[0], but the length itself should not be sent to the slave. In
contrast, for BLOCK_DATA reads no length is specified since the length
will be the first byte returned from the slave. When copying data back
to the data buffer, for an I2C_BLOCK_DATA read we have to take care not to
overwrite data->block[0] to avoid overwriting the length. A BLOCK_DATA
read doesn't have this concern since the first byte returned by the device
is the length and belongs in data->block[0].

For I2C_BLOCK_DATA writes, the length is also provided in data->block[0],
but the length itself is not sent to the slave (in contrast to BLOCK_DATA
writes where the length prefixes the data sent to the slave).

This was tested on physical hardware using i2cdump with the i and s flags
to test the behavior of I2C_BLOCK_DATA reads and BLOCK_DATA reads,
respectively. Writes were not tested but the I2C_BLOCK_DATA write change
is pretty simple to verify by inspection.

Signed-off-by: Eudean Sun <eudean@arista.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-cp2112.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/hid/hid-cp2112.c b/drivers/hid/hid-cp2112.c
index 68cdc962265b..271f31461da4 100644
--- a/drivers/hid/hid-cp2112.c
+++ b/drivers/hid/hid-cp2112.c
@@ -696,8 +696,16 @@ static int cp2112_xfer(struct i2c_adapter *adap, u16 addr,
 					      (u8 *)&word, 2);
 		break;
 	case I2C_SMBUS_I2C_BLOCK_DATA:
-		size = I2C_SMBUS_BLOCK_DATA;
-		/* fallthrough */
+		if (read_write == I2C_SMBUS_READ) {
+			read_length = data->block[0];
+			count = cp2112_write_read_req(buf, addr, read_length,
+						      command, NULL, 0);
+		} else {
+			count = cp2112_write_req(buf, addr, command,
+						 data->block + 1,
+						 data->block[0]);
+		}
+		break;
 	case I2C_SMBUS_BLOCK_DATA:
 		if (I2C_SMBUS_READ == read_write) {
 			count = cp2112_write_read_req(buf, addr,
@@ -785,6 +793,9 @@ static int cp2112_xfer(struct i2c_adapter *adap, u16 addr,
 	case I2C_SMBUS_WORD_DATA:
 		data->word = le16_to_cpup((__le16 *)buf);
 		break;
+	case I2C_SMBUS_I2C_BLOCK_DATA:
+		memcpy(data->block + 1, buf, read_length);
+		break;
 	case I2C_SMBUS_BLOCK_DATA:
 		if (read_length > I2C_SMBUS_BLOCK_MAX) {
 			ret = -EPROTO;

From 5ddc3c656bfb5c90d0196ff72b908d0343fef85e Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Sat, 25 Nov 2017 15:48:32 -0800
Subject: [PATCH 002/305] Input: ims-pcu - fix typo in the error message

1. change "to" to "too".
2. move ")" to the front of "\n", which was discovered by Joe Perches.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Reviewed-by: Joe Perches <joe@perches.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/misc/ims-pcu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/misc/ims-pcu.c b/drivers/input/misc/ims-pcu.c
index ae473123583b..3d51175c4d72 100644
--- a/drivers/input/misc/ims-pcu.c
+++ b/drivers/input/misc/ims-pcu.c
@@ -1651,7 +1651,7 @@ ims_pcu_get_cdc_union_desc(struct usb_interface *intf)
 				return union_desc;
 
 			dev_err(&intf->dev,
-				"Union descriptor to short (%d vs %zd\n)",
+				"Union descriptor too short (%d vs %zd)\n",
 				union_desc->bLength, sizeof(*union_desc));
 			return NULL;
 		}

From 10d900303f1c3a821eb0bef4e7b7ece16768fba4 Mon Sep 17 00:00:00 2001
From: Aaron Ma <aaron.ma@canonical.com>
Date: Sat, 25 Nov 2017 16:48:41 -0800
Subject: [PATCH 003/305] Input: elantech - add new icbody type 15

The touchpad of Lenovo Thinkpad L480 reports its version as 15.

Cc: stable@vger.kernel.org
Signed-off-by: Aaron Ma <aaron.ma@canonical.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/mouse/elantech.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c
index b84cd978fce2..a4aaa748e987 100644
--- a/drivers/input/mouse/elantech.c
+++ b/drivers/input/mouse/elantech.c
@@ -1613,7 +1613,7 @@ static int elantech_set_properties(struct elantech_data *etd)
 		case 5:
 			etd->hw_version = 3;
 			break;
-		case 6 ... 14:
+		case 6 ... 15:
 			etd->hw_version = 4;
 			break;
 		default:

From bdfe4cebea11476d278b1b98dd0f7cdac8269d62 Mon Sep 17 00:00:00 2001
From: Icenowy Zheng <icenowy@aosc.io>
Date: Fri, 10 Nov 2017 17:26:54 +0800
Subject: [PATCH 004/305] arm64: allwinner: a64: add Ethernet PHY regulator for
 several boards

On several A64 boards the Ethernet PHY is powered by the DC1SW regulator
on the AXP803 PMIC.

Add the phy-supply property to these boards' emac nodes.

Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
Acked-by: Corentin LABBE <clabbe.montjoie@gmail.com>
Tested-by: Corentin LABBE <clabbe.montjoie@gmail.com>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts     | 1 +
 arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts           | 1 +
 arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts | 1 +
 3 files changed, 3 insertions(+)

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
index 45bdbfb96126..4a8d3f83a36e 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
@@ -75,6 +75,7 @@
 	pinctrl-0 = <&rgmii_pins>;
 	phy-mode = "rgmii";
 	phy-handle = <&ext_rgmii_phy>;
+	phy-supply = <&reg_dc1sw>;
 	status = "okay";
 };
 
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
index 806442d3e846..604cdaedac38 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
@@ -77,6 +77,7 @@
 	pinctrl-0 = <&rmii_pins>;
 	phy-mode = "rmii";
 	phy-handle = <&ext_rmii_phy1>;
+	phy-supply = <&reg_dc1sw>;
 	status = "okay";
 
 };
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
index 0eb2acedf8c3..a053a6ac5267 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
@@ -82,6 +82,7 @@
 	pinctrl-0 = <&rgmii_pins>;
 	phy-mode = "rgmii";
 	phy-handle = <&ext_rgmii_phy>;
+	phy-supply = <&reg_dc1sw>;
 	status = "okay";
 };
 

From 329b4130bc5eb2a1b123a652b985dbdb08d6b9a8 Mon Sep 17 00:00:00 2001
From: Alexey Brodkin <Alexey.Brodkin@synopsys.com>
Date: Thu, 23 Nov 2017 13:21:55 +0300
Subject: [PATCH 005/305] ARC: Fix detection of dual-issue enabled

As per PRM bit #0 ("D") in EXEC_CTRL enables dual-issue if set to 0,
otherwise if set to 1 all instructions are executed one at a time,
i.e. dual-issue is disabled.

Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/kernel/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c
index 7ef7d9a8ff89..9d27331fe69a 100644
--- a/arch/arc/kernel/setup.c
+++ b/arch/arc/kernel/setup.c
@@ -199,7 +199,7 @@ static void read_arc_build_cfg_regs(void)
 			unsigned int exec_ctrl;
 
 			READ_BCR(AUX_EXEC_CTRL, exec_ctrl);
-			cpu->extn.dual_enb = exec_ctrl & 1;
+			cpu->extn.dual_enb = !(exec_ctrl & 1);
 
 			/* dual issue always present for this core */
 			cpu->extn.dual = 1;

From da8df83957b179e5edc1029f637e5b69eff44967 Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Wed, 29 Nov 2017 22:48:11 -0800
Subject: [PATCH 006/305] Input: joystick/analog - riscv has get_cycles()

Fixes:

drivers/input/joystick/analog.c:176:2: warning: #warning Precise timer not defined for this architecture. [-Wcpp]

Signed-off-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/joystick/analog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index 3d8ff09eba57..c868a878c84f 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -163,7 +163,7 @@ static unsigned int get_time_pit(void)
 #define GET_TIME(x)	do { x = (unsigned int)rdtsc(); } while (0)
 #define DELTA(x,y)	((y)-(x))
 #define TIME_NAME	"TSC"
-#elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE)
+#elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV) || defined(CONFIG_TILE)
 #define GET_TIME(x)	do { x = get_cycles(); } while (0)
 #define DELTA(x,y)	((y)-(x))
 #define TIME_NAME	"get_cycles"

From 4c83c071b7849ca3e8072284a8587669d8ba6a3d Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 16 Nov 2017 16:09:29 -0800
Subject: [PATCH 007/305] Input: elants_i2c - do not clobber interrupt trigger
 on x86

This is similar to commit a4b0a58bb142 ("Input: elan_i2c - do not
clobber interrupt trigger on x86")

On x86 we historically used falling edge interrupts in the driver
because that's how first Chrome devices were configured. They also
did not use ACPI to enumerate I2C devices (because back then there
was no kernel support for that), so trigger was hard-coded in the
driver. However the controller behavior is much more reliable if
we use level triggers, and that is how we configured ARM devices,
and how we want to configure newer x86 devices as well. All newer
x86 boxes have their I2C devices enumerated in ACPI.

Let's see if platform code (ACPI, DT) described interrupt and
specified particular trigger type, and if so, let's use it instead
of always clobbering trigger with IRQF_TRIGGER_FALLING. We will
still use this trigger type as a fallback if platform code left
interrupt trigger unconfigured.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/elants_i2c.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/input/touchscreen/elants_i2c.c b/drivers/input/touchscreen/elants_i2c.c
index e102d7764bc2..a458e5ec9e41 100644
--- a/drivers/input/touchscreen/elants_i2c.c
+++ b/drivers/input/touchscreen/elants_i2c.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/input.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/platform_device.h>
 #include <linux/async.h>
 #include <linux/i2c.h>
@@ -1261,10 +1262,13 @@ static int elants_i2c_probe(struct i2c_client *client,
 	}
 
 	/*
-	 * Systems using device tree should set up interrupt via DTS,
-	 * the rest will use the default falling edge interrupts.
+	 * Platform code (ACPI, DTS) should normally set up interrupt
+	 * for us, but in case it did not let's fall back to using falling
+	 * edge to be compatible with older Chromebooks.
 	 */
-	irqflags = client->dev.of_node ? 0 : IRQF_TRIGGER_FALLING;
+	irqflags = irq_get_trigger_type(client->irq);
+	if (!irqflags)
+		irqflags = IRQF_TRIGGER_FALLING;
 
 	error = devm_request_threaded_irq(&client->dev, client->irq,
 					  NULL, elants_i2c_irq,

From 56075f6072e7fdac302cff4e1b4c93b64ced99ab Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Sun, 26 Nov 2017 15:34:04 +1100
Subject: [PATCH 008/305] HID: holtekff: move MODULE_* parameters out of #ifdef
 block

If you compile with:
CONFIG_HID_HOLTEK=m
CONFIG_HOLTEK_FF is not set

You get the following warning:
WARNING: modpost: missing MODULE_LICENSE() in drivers/hid/hid-holtekff.o
see include/linux/module.h for more information

Fix this by moving the module info out of the #ifdef CONFIG_HOLTEK_FF
block and into the un-guarded part of the file.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Acked-by: Anssi Hannula <anssi.hannula@iki.fi>
Reviewed-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-holtekff.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/hid/hid-holtekff.c b/drivers/hid/hid-holtekff.c
index 9325545fc3ae..edc0f64bb584 100644
--- a/drivers/hid/hid-holtekff.c
+++ b/drivers/hid/hid-holtekff.c
@@ -32,10 +32,6 @@
 
 #ifdef CONFIG_HOLTEK_FF
 
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Anssi Hannula <anssi.hannula@iki.fi>");
-MODULE_DESCRIPTION("Force feedback support for Holtek On Line Grip based devices");
-
 /*
  * These commands and parameters are currently known:
  *
@@ -223,3 +219,7 @@ static struct hid_driver holtek_driver = {
 	.probe = holtek_probe,
 };
 module_hid_driver(holtek_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Anssi Hannula <anssi.hannula@iki.fi>");
+MODULE_DESCRIPTION("Force feedback support for Holtek On Line Grip based devices");

From 741f5afbba70ff3cddcc5bba2595d9a44fa722e5 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Sat, 2 Dec 2017 17:36:45 +0100
Subject: [PATCH 009/305] ARM: dts: rockchip: add cpu0-regulator on
 rk3066a-marsboard

The rk3066 also has operating points now, but without adjusting
the cpu-regulator things will break once higher voltages are needed
for a specific frequency, so add the needed cpu0-regulator.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 arch/arm/boot/dts/rk3066a-marsboard.dts | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm/boot/dts/rk3066a-marsboard.dts b/arch/arm/boot/dts/rk3066a-marsboard.dts
index c6d92c25df42..d23ee6d911ac 100644
--- a/arch/arm/boot/dts/rk3066a-marsboard.dts
+++ b/arch/arm/boot/dts/rk3066a-marsboard.dts
@@ -83,6 +83,10 @@
 	};
 };
 
+&cpu0 {
+	cpu0-supply = <&vdd_arm>;
+};
+
 &i2c1 {
 	status = "okay";
 	clock-frequency = <400000>;

From 912d7985f3cef1b901a4fd9fede549b919fe7ac3 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 9 Nov 2017 16:35:35 -0600
Subject: [PATCH 010/305] ARM: dts: rockchip: fix rk3288 iep-IOMMU interrupts
 property cells

The interrupts property in the iep-IOMMU node for the rk3288 dts file has a
spurious extra cell causing a dtc warning:

Warning (interrupts_property): interrupts size is (16), expected multiple of 12 in /iommu@ff900800

Remove the extra cell.

Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 arch/arm/boot/dts/rk3288.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi
index cd24894ee5c6..6102e4e7f35c 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -956,7 +956,7 @@
 	iep_mmu: iommu@ff900800 {
 		compatible = "rockchip,iommu";
 		reg = <0x0 0xff900800 0x0 0x40>;
-		interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH 0>;
+		interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
 		interrupt-names = "iep_mmu";
 		#iommu-cells = <0>;
 		status = "disabled";

From 3fa8c49f27c15df259b7b8f94eb126ae491893fd Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Mon, 4 Dec 2017 18:36:10 +0100
Subject: [PATCH 011/305] arm64: dts: rockchip: fix trailing 0 in rk3328 tsadc
 interrupts

Probably due to some copy-paste mistake, the tsadc of rk3328 ended up
with a 0 as 4th element that shouldn't be there, as interrupts on the
rk3328 only have multiples of 3, making dtc complain. So remove it.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 arch/arm64/boot/dts/rockchip/rk3328.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi
index 41d61840fb99..2426da631938 100644
--- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi
@@ -514,7 +514,7 @@
 	tsadc: tsadc@ff250000 {
 		compatible = "rockchip,rk3328-tsadc";
 		reg = <0x0 0xff250000 0x0 0x100>;
-		interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH 0>;
+		interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH>;
 		assigned-clocks = <&cru SCLK_TSADC>;
 		assigned-clock-rates = <50000>;
 		clocks = <&cru SCLK_TSADC>, <&cru PCLK_TSADC>;

From bc53e3aa88e8240823c1c440e6bab3c3a5ba5f59 Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Mon, 27 Nov 2017 17:31:01 +0100
Subject: [PATCH 012/305] ARM: dts: at91: disable the nxp,se97b SMBUS timeout
 on the TSE-850

The I2C adapter driver is sometimes slow, causing the SCL line to
be stuck low for more than the stipulated SMBUS timeout of 25-35 ms.
This causes the client device to give up which in turn causes silent
corruption of data. So, disable the SMBUS timeout in the client device.

Signed-off-by: Peter Rosin <peda@axentia.se>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 arch/arm/boot/dts/at91-tse850-3.dts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/at91-tse850-3.dts b/arch/arm/boot/dts/at91-tse850-3.dts
index 5f29010cdbd8..9b82cc8843e1 100644
--- a/arch/arm/boot/dts/at91-tse850-3.dts
+++ b/arch/arm/boot/dts/at91-tse850-3.dts
@@ -221,6 +221,7 @@
 	jc42@18 {
 		compatible = "nxp,se97b", "jedec,jc-42.4-temp";
 		reg = <0x18>;
+		smbus-timeout-disable;
 	};
 
 	dpot: mcp4651-104@28 {

From e2bf801ecd4e62222a46d1ba9e57e710171d29c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Br=C3=BCns?= <stefan.bruens@rwth-aachen.de>
Date: Mon, 27 Nov 2017 20:05:34 +0100
Subject: [PATCH 013/305] sunxi-rsb: Include OF based modalias in device uevent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Include the OF-based modalias in the uevent sent when registering devices
on the sunxi RSB bus, so that user space has a chance to autoload the
kernel module for the device.

Fixes a regression caused by commit 3f241bfa60bd ("arm64: allwinner: a64:
pine64: Use dcdc1 regulator for mmc0"). When the axp20x-rsb module for
the AXP803 PMIC is built as a module, it is not loaded and the system
ends up with a dysfunctional MMC controller.

Fixes: d787dcdb9c8f ("bus: sunxi-rsb: Add driver for Allwinner Reduced Serial Bus")
Cc: stable <stable@vger.kernel.org> # 4.4.x 7a3b7cd332db of: device: Export of_device_{get_modalias, uvent_modalias} to modules
Acked-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Stefan Brüns <stefan.bruens@rwth-aachen.de>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 drivers/bus/sunxi-rsb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/bus/sunxi-rsb.c b/drivers/bus/sunxi-rsb.c
index 328ca93781cf..1b76d9585902 100644
--- a/drivers/bus/sunxi-rsb.c
+++ b/drivers/bus/sunxi-rsb.c
@@ -178,6 +178,7 @@ static struct bus_type sunxi_rsb_bus = {
 	.match		= sunxi_rsb_device_match,
 	.probe		= sunxi_rsb_device_probe,
 	.remove		= sunxi_rsb_device_remove,
+	.uevent		= of_device_uevent_modalias,
 };
 
 static void sunxi_rsb_dev_release(struct device *dev)

From e17e237cd69f9f6ecaa0e875f889ad401a625148 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 4 Dec 2017 16:44:01 +0800
Subject: [PATCH 014/305] ARM: dts: sunxi: Convert to CCU index macros for HDMI
 controller

When the HDMI controller device node was added, the needed PLL clock
macros were not exported. A separate patch addresses that, but it is
merged through a different tree.

Now that both patches are in mainline proper, we can convert the raw
numbers to proper macros.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 arch/arm/boot/dts/sun4i-a10.dtsi  | 4 ++--
 arch/arm/boot/dts/sun5i-a10s.dtsi | 4 ++--
 arch/arm/boot/dts/sun6i-a31.dtsi  | 4 ++--
 arch/arm/boot/dts/sun7i-a20.dtsi  | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/arm/boot/dts/sun4i-a10.dtsi b/arch/arm/boot/dts/sun4i-a10.dtsi
index b91300d49a31..5840f5c75c3b 100644
--- a/arch/arm/boot/dts/sun4i-a10.dtsi
+++ b/arch/arm/boot/dts/sun4i-a10.dtsi
@@ -502,8 +502,8 @@
 			reg = <0x01c16000 0x1000>;
 			interrupts = <58>;
 			clocks = <&ccu CLK_AHB_HDMI0>, <&ccu CLK_HDMI>,
-				 <&ccu 9>,
-				 <&ccu 18>;
+				 <&ccu CLK_PLL_VIDEO0_2X>,
+				 <&ccu CLK_PLL_VIDEO1_2X>;
 			clock-names = "ahb", "mod", "pll-0", "pll-1";
 			dmas = <&dma SUN4I_DMA_NORMAL 16>,
 			       <&dma SUN4I_DMA_NORMAL 16>,
diff --git a/arch/arm/boot/dts/sun5i-a10s.dtsi b/arch/arm/boot/dts/sun5i-a10s.dtsi
index 6ae4d95e230e..316cb8b2945b 100644
--- a/arch/arm/boot/dts/sun5i-a10s.dtsi
+++ b/arch/arm/boot/dts/sun5i-a10s.dtsi
@@ -82,8 +82,8 @@
 			reg = <0x01c16000 0x1000>;
 			interrupts = <58>;
 			clocks = <&ccu CLK_AHB_HDMI>, <&ccu CLK_HDMI>,
-				 <&ccu 9>,
-				 <&ccu 16>;
+				 <&ccu CLK_PLL_VIDEO0_2X>,
+				 <&ccu CLK_PLL_VIDEO1_2X>;
 			clock-names = "ahb", "mod", "pll-0", "pll-1";
 			dmas = <&dma SUN4I_DMA_NORMAL 16>,
 			       <&dma SUN4I_DMA_NORMAL 16>,
diff --git a/arch/arm/boot/dts/sun6i-a31.dtsi b/arch/arm/boot/dts/sun6i-a31.dtsi
index 8bfa12b548e0..72d3fe44ecaf 100644
--- a/arch/arm/boot/dts/sun6i-a31.dtsi
+++ b/arch/arm/boot/dts/sun6i-a31.dtsi
@@ -429,8 +429,8 @@
 			interrupts = <GIC_SPI 88 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_AHB1_HDMI>, <&ccu CLK_HDMI>,
 				 <&ccu CLK_HDMI_DDC>,
-				 <&ccu 7>,
-				 <&ccu 13>;
+				 <&ccu CLK_PLL_VIDEO0_2X>,
+				 <&ccu CLK_PLL_VIDEO1_2X>;
 			clock-names = "ahb", "mod", "ddc", "pll-0", "pll-1";
 			resets = <&ccu RST_AHB1_HDMI>;
 			reset-names = "ahb";
diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi
index 68dfa82544fc..59655e42e4b0 100644
--- a/arch/arm/boot/dts/sun7i-a20.dtsi
+++ b/arch/arm/boot/dts/sun7i-a20.dtsi
@@ -581,8 +581,8 @@
 			reg = <0x01c16000 0x1000>;
 			interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_AHB_HDMI0>, <&ccu CLK_HDMI>,
-				 <&ccu 9>,
-				 <&ccu 18>;
+				 <&ccu CLK_PLL_VIDEO0_2X>,
+				 <&ccu CLK_PLL_VIDEO1_2X>;
 			clock-names = "ahb", "mod", "pll-0", "pll-1";
 			dmas = <&dma SUN4I_DMA_NORMAL 16>,
 			       <&dma SUN4I_DMA_NORMAL 16>,

From 7d556bfc49adddf2beb0d16c91945c3b8b783282 Mon Sep 17 00:00:00 2001
From: Jagan Teki <jagannadh.teki@gmail.com>
Date: Mon, 4 Dec 2017 10:23:07 +0530
Subject: [PATCH 015/305] arm64: allwinner: a64-sopine: Fix to use dcdc1
 regulator instead of vcc3v3

Since the current tree supports the AXP803 regulators,
replace the fixed regulator vcc3v3 with the AXP803 dcdc1 regulator
wherever it needs to be replaced.

Tested mmc0 on sopine baseboard.

Signed-off-by: Jagan Teki <jagan@amarulasolutions.com>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 .../dts/allwinner/sun50i-a64-sopine-baseboard.dts     |  2 +-
 arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi  | 11 +----------
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
index a053a6ac5267..abe179de35d7 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
@@ -96,7 +96,7 @@
 &mmc2 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc2_pins>;
-	vmmc-supply = <&reg_vcc3v3>;
+	vmmc-supply = <&reg_dcdc1>;
 	vqmmc-supply = <&reg_vcc1v8>;
 	bus-width = <8>;
 	non-removable;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi
index a5da18a6f286..43418bd881d8 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi
@@ -45,19 +45,10 @@
 
 #include "sun50i-a64.dtsi"
 
-/ {
-	reg_vcc3v3: vcc3v3 {
-		compatible = "regulator-fixed";
-		regulator-name = "vcc3v3";
-		regulator-min-microvolt = <3300000>;
-		regulator-max-microvolt = <3300000>;
-	};
-};
-
 &mmc0 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc0_pins>;
-	vmmc-supply = <&reg_vcc3v3>;
+	vmmc-supply = <&reg_dcdc1>;
 	non-removable;
 	disable-wp;
 	bus-width = <4>;

From f88e9301948173dd35afad4a6939092c7f269aed Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <geomatsi@gmail.com>
Date: Fri, 3 Nov 2017 22:58:54 +0300
Subject: [PATCH 016/305] arm64: dts: orange-pi-zero-plus2: fix sdcard detect

The sdcard detect pin on orange-pi-zero-plus2 is pulled up.
Fix cd-gpio description to enable sdcard detect.

Signed-off-by: Sergey Matyukevich <geomatsi@gmail.com>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts
index b6b7a561df8c..a42fd79a62a3 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts
@@ -71,7 +71,7 @@
 	pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
 	vmmc-supply = <&reg_vcc3v3>;
 	bus-width = <4>;
-	cd-gpios = <&pio 5 6 GPIO_ACTIVE_HIGH>;
+	cd-gpios = <&pio 5 6 GPIO_ACTIVE_LOW>;
 	status = "okay";
 };
 

From 87eba0716011e528f7841026f2cc65683219d0ad Mon Sep 17 00:00:00 2001
From: Klaus Goger <klaus.goger@theobroma-systems.com>
Date: Tue, 5 Dec 2017 08:11:58 +0100
Subject: [PATCH 017/305] arm64: dts: rockchip: remove vdd_log from rk3399-puma

vdd_log has no consumer and therefore will not be set to a specific
voltage. Still the PWM output pin gets configured and thence the vdd_log
output voltage will be changed from its default. Depending on the idle
state of the PWM this will slightly over or undervoltage the logic supply
of the RK3399 and cause instability with GbE (undervoltage) and PCIe
(overvoltage). Since the default value set by a voltage divider is the
correct supply voltage and we don't need to change it during runtime we
remove the rail from the devicetree completely so the PWM pin will not
be configured.

Signed-off-by: Klaus Goger <klaus.goger@theobroma-systems.com>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi
index 910628d18add..1fc5060d7027 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi
@@ -155,17 +155,6 @@
 		regulator-min-microvolt = <5000000>;
 		regulator-max-microvolt = <5000000>;
 	};
-
-	vdd_log: vdd-log {
-		compatible = "pwm-regulator";
-		pwms = <&pwm2 0 25000 0>;
-		regulator-name = "vdd_log";
-		regulator-min-microvolt = <800000>;
-		regulator-max-microvolt = <1400000>;
-		regulator-always-on;
-		regulator-boot-on;
-		status = "okay";
-	};
 };
 
 &cpu_b0 {

From bc631943faba6fc3f755748091ada31798fb7d50 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Wed, 6 Dec 2017 01:10:05 +0100
Subject: [PATCH 018/305] arm64: dts: rockchip: limit rk3328-rock64 gmac speed
 to 100MBit for now

It looks like either the current kernel or the hardware has reliability
issues when the gmac is actually running at 1GBit. In my test-case
it is not able to boot on a nfsroot at this speed, as the system
will always lose the connection to the nfs-server during boot, before
reaching any login prompt and not recover from this.

So until this is solved, limit the speed to 100MBit as with this the
nfsroot survives stress tests like an apt-get upgrade without problems.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 arch/arm64/boot/dts/rockchip/rk3328-rock64.dts | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
index d4f80786e7c2..3890468678ce 100644
--- a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
@@ -132,6 +132,8 @@
 	assigned-clocks = <&cru SCLK_MAC2IO>, <&cru SCLK_MAC2IO_EXT>;
 	assigned-clock-parents = <&gmac_clkin>, <&gmac_clkin>;
 	clock_in_out = "input";
+	/* shows instability at 1GBit right now */
+	max-speed = <100>;
 	phy-supply = <&vcc_io>;
 	phy-mode = "rgmii";
 	pinctrl-names = "default";

From b638823a7bbd251d442042b0e9522100bdaa5b66 Mon Sep 17 00:00:00 2001
From: Alejandro Mery <amery@hanoverdisplays.com>
Date: Tue, 5 Dec 2017 12:34:56 +0000
Subject: [PATCH 019/305] ARM: davinci: Use platform_device_register_full() to
 create pdev for dm365's eDMA

Convert the DM365 EDMA platform device creation to use
struct platform_device_info XXXXXX __initconst and
platform_device_register_full()

This will allow us to specify the dma_mask for the device
in an upcoming patch. Without this, EDMA on DM365 refuses
to probe.

Fixes: 7ab388e85faa ("ARM: davinci: Use platform_device_register_full() to create pdev for eDMA")
Reviewed-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Alejandro Mery <amery@hanoverdisplays.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/dm365.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c
index 8be04ec95adf..9bd17bc77b5c 100644
--- a/arch/arm/mach-davinci/dm365.c
+++ b/arch/arm/mach-davinci/dm365.c
@@ -925,12 +925,13 @@ static struct resource edma_resources[] = {
 	/* not using TC*_ERR */
 };
 
-static struct platform_device dm365_edma_device = {
-	.name			= "edma",
-	.id			= 0,
-	.dev.platform_data	= &dm365_edma_pdata,
-	.num_resources		= ARRAY_SIZE(edma_resources),
-	.resource		= edma_resources,
+static const struct platform_device_info dm365_edma_device __initconst = {
+	.name		= "edma",
+	.id		= 0,
+	.res		= edma_resources,
+	.num_res	= ARRAY_SIZE(edma_resources),
+	.data		= &dm365_edma_pdata,
+	.size_data	= sizeof(dm365_edma_pdata),
 };
 
 static struct resource dm365_asp_resources[] = {
@@ -1428,13 +1429,18 @@ int __init dm365_init_video(struct vpfe_config *vpfe_cfg,
 
 static int __init dm365_init_devices(void)
 {
+	struct platform_device *edma_pdev;
 	int ret = 0;
 
 	if (!cpu_is_davinci_dm365())
 		return 0;
 
 	davinci_cfg_reg(DM365_INT_EDMA_CC);
-	platform_device_register(&dm365_edma_device);
+	edma_pdev = platform_device_register_full(&dm365_edma_device);
+	if (IS_ERR(edma_pdev)) {
+		pr_warn("%s: Failed to register eDMA\n", __func__);
+		return PTR_ERR(edma_pdev);
+	}
 
 	platform_device_register(&dm365_mdio_device);
 	platform_device_register(&dm365_emac_device);

From 621f96bcb49412010876a1e6e006f748b91d9e75 Mon Sep 17 00:00:00 2001
From: Alejandro Mery <amery@hanoverdisplays.com>
Date: Tue, 5 Dec 2017 12:34:57 +0000
Subject: [PATCH 020/305] ARM: davinci: Add dma_mask to dm365's eDMA device

Add dma_mask to dm365's EDMA device.

Without a valid dma_mask, EDMA on DM365 refuses to
probe.

Fixes: cef5b0da4019 ("ARM: davinci: Add dma_mask to eDMA devices")
Reviewed-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Alejandro Mery <amery@hanoverdisplays.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/dm365.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c
index 9bd17bc77b5c..103316f01a22 100644
--- a/arch/arm/mach-davinci/dm365.c
+++ b/arch/arm/mach-davinci/dm365.c
@@ -928,6 +928,7 @@ static struct resource edma_resources[] = {
 static const struct platform_device_info dm365_edma_device __initconst = {
 	.name		= "edma",
 	.id		= 0,
+	.dma_mask	= DMA_BIT_MASK(32),
 	.res		= edma_resources,
 	.num_res	= ARRAY_SIZE(edma_resources),
 	.data		= &dm365_edma_pdata,

From c5a88cd2e1c508868922bafa0a5c3365986b98e5 Mon Sep 17 00:00:00 2001
From: David Lechner <david@lechnology.com>
Date: Sun, 3 Dec 2017 16:04:53 -0600
Subject: [PATCH 021/305] ARM: dts: da850-lego-ev3: Fix battery voltage gpio

This fixes the battery voltage monitoring gpio-hog settings.

When the gpio is low, it turns off the battery voltage to the ADC chip.
However, this needs to be on all of the time so that we can monitor
battery voltage.

Also, there was a typo that prevented pinmuxing from working correctly.

Signed-off-by: David Lechner <david@lechnology.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/boot/dts/da850-lego-ev3.dts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/da850-lego-ev3.dts b/arch/arm/boot/dts/da850-lego-ev3.dts
index 413dbd5d9f64..81942ae83e1f 100644
--- a/arch/arm/boot/dts/da850-lego-ev3.dts
+++ b/arch/arm/boot/dts/da850-lego-ev3.dts
@@ -178,7 +178,7 @@
 	 */
 	battery {
 		pinctrl-names = "default";
-		pintctrl-0 = <&battery_pins>;
+		pinctrl-0 = <&battery_pins>;
 		compatible = "lego,ev3-battery";
 		io-channels = <&adc 4>, <&adc 3>;
 		io-channel-names = "voltage", "current";
@@ -392,7 +392,7 @@
 	batt_volt_en {
 		gpio-hog;
 		gpios = <6 GPIO_ACTIVE_HIGH>;
-		output-low;
+		output-high;
 	};
 };
 

From 7cb4774e2d3282d29edd00762167876a27cc7d2a Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 6 Dec 2017 17:54:38 +0100
Subject: [PATCH 022/305] HID: core: lower log level for unknown main item tags
 to warnings

Given all the effort distros have done with splash-screens to give
users a nice clean boot experience, we really want dmesg --level=err
to not print anything unless there is a real problem with either the
hardware or the kernel. Buggy HID descriptors unfortunately happen
all too often, so lower the log level to warning to keep the console
clear of error messages such as:

[  441.079664] apple 0005:05AC:0239.0003: unknown main item tag 0x0

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index f3fcb836a1f9..0c3f608131cf 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -551,7 +551,7 @@ static int hid_parser_main(struct hid_parser *parser, struct hid_item *item)
 		ret = hid_add_field(parser, HID_FEATURE_REPORT, data);
 		break;
 	default:
-		hid_err(parser->device, "unknown main item tag 0x%x\n", item->tag);
+		hid_warn(parser->device, "unknown main item tag 0x%x\n", item->tag);
 		ret = 0;
 	}
 

From 451df7d110b82998c04a80d0de0f1e79aaa7792a Mon Sep 17 00:00:00 2001
From: Alejandro Mery <amery@hanoverdisplays.com>
Date: Fri, 8 Dec 2017 10:35:58 +0000
Subject: [PATCH 023/305] ARM: davinci: fix mmc entries in dm365's
 dma_slave_map

Fix the mmc entries in dm365's dma_slave_map to match the actual device
names.

Fixes: 0c750e1fe481 ("ARM: davinci: dm365: Add dma_slave_map to edma")
Signed-off-by: Alejandro Mery <amery@hanoverdisplays.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/dm365.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c
index 103316f01a22..5ace9380626a 100644
--- a/arch/arm/mach-davinci/dm365.c
+++ b/arch/arm/mach-davinci/dm365.c
@@ -868,10 +868,10 @@ static const struct dma_slave_map dm365_edma_map[] = {
 	{ "spi_davinci.0", "rx", EDMA_FILTER_PARAM(0, 17) },
 	{ "spi_davinci.3", "tx", EDMA_FILTER_PARAM(0, 18) },
 	{ "spi_davinci.3", "rx", EDMA_FILTER_PARAM(0, 19) },
-	{ "dm6441-mmc.0", "rx", EDMA_FILTER_PARAM(0, 26) },
-	{ "dm6441-mmc.0", "tx", EDMA_FILTER_PARAM(0, 27) },
-	{ "dm6441-mmc.1", "rx", EDMA_FILTER_PARAM(0, 30) },
-	{ "dm6441-mmc.1", "tx", EDMA_FILTER_PARAM(0, 31) },
+	{ "da830-mmc.0", "rx", EDMA_FILTER_PARAM(0, 26) },
+	{ "da830-mmc.0", "tx", EDMA_FILTER_PARAM(0, 27) },
+	{ "da830-mmc.1", "rx", EDMA_FILTER_PARAM(0, 30) },
+	{ "da830-mmc.1", "tx", EDMA_FILTER_PARAM(0, 31) },
 };
 
 static struct edma_soc_info dm365_edma_pdata = {

From 33cd3c07a976e11c3c4cc6b0b3db6760ad1590c5 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Dec 2017 12:16:22 +0000
Subject: [PATCH 024/305] drm/armada: fix leak of crtc structure

Fix the leak of the CRTC structure in the failure paths of
armada_drm_crtc_create().

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/gpu/drm/armada/armada_crtc.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/armada/armada_crtc.c b/drivers/gpu/drm/armada/armada_crtc.c
index 2e065facdce7..50a3a97b4289 100644
--- a/drivers/gpu/drm/armada/armada_crtc.c
+++ b/drivers/gpu/drm/armada/armada_crtc.c
@@ -1225,17 +1225,13 @@ static int armada_drm_crtc_create(struct drm_device *drm, struct device *dev,
 
 	ret = devm_request_irq(dev, irq, armada_drm_irq, 0, "armada_drm_crtc",
 			       dcrtc);
-	if (ret < 0) {
-		kfree(dcrtc);
-		return ret;
-	}
+	if (ret < 0)
+		goto err_crtc;
 
 	if (dcrtc->variant->init) {
 		ret = dcrtc->variant->init(dcrtc, dev);
-		if (ret) {
-			kfree(dcrtc);
-			return ret;
-		}
+		if (ret)
+			goto err_crtc;
 	}
 
 	/* Ensure AXI pipeline is enabled */
@@ -1246,13 +1242,15 @@ static int armada_drm_crtc_create(struct drm_device *drm, struct device *dev,
 	dcrtc->crtc.port = port;
 
 	primary = kzalloc(sizeof(*primary), GFP_KERNEL);
-	if (!primary)
-		return -ENOMEM;
+	if (!primary) {
+		ret = -ENOMEM;
+		goto err_crtc;
+	}
 
 	ret = armada_drm_plane_init(primary);
 	if (ret) {
 		kfree(primary);
-		return ret;
+		goto err_crtc;
 	}
 
 	ret = drm_universal_plane_init(drm, &primary->base, 0,
@@ -1263,7 +1261,7 @@ static int armada_drm_crtc_create(struct drm_device *drm, struct device *dev,
 				       DRM_PLANE_TYPE_PRIMARY, NULL);
 	if (ret) {
 		kfree(primary);
-		return ret;
+		goto err_crtc;
 	}
 
 	ret = drm_crtc_init_with_planes(drm, &dcrtc->crtc, &primary->base, NULL,
@@ -1282,6 +1280,9 @@ static int armada_drm_crtc_create(struct drm_device *drm, struct device *dev,
 
 err_crtc_init:
 	primary->base.funcs->destroy(&primary->base);
+err_crtc:
+	kfree(dcrtc);
+
 	return ret;
 }
 

From 2bf57436d52b241044133fb0e2c7fd8320c6b02e Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Dec 2017 12:16:22 +0000
Subject: [PATCH 025/305] drm/armada: fix SRAM powerdown

Avoid powering down the overlay SRAM banks when disabling the primary
plane, thereby masking any overlay video.  This feature is supposed to
allow us to cut the bandwidth required while displaying full-frame
overlay video.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/gpu/drm/armada/armada_crtc.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/armada/armada_crtc.c b/drivers/gpu/drm/armada/armada_crtc.c
index 50a3a97b4289..400a133c0576 100644
--- a/drivers/gpu/drm/armada/armada_crtc.c
+++ b/drivers/gpu/drm/armada/armada_crtc.c
@@ -744,15 +744,14 @@ void armada_drm_crtc_plane_disable(struct armada_crtc *dcrtc,
 	if (plane->fb)
 		drm_framebuffer_put(plane->fb);
 
-	/* Power down the Y/U/V FIFOs */
-	sram_para1 = CFG_PDWN16x66 | CFG_PDWN32x66;
-
 	/* Power down most RAMs and FIFOs if this is the primary plane */
 	if (plane->type == DRM_PLANE_TYPE_PRIMARY) {
-		sram_para1 |= CFG_PDWN256x32 | CFG_PDWN256x24 | CFG_PDWN256x8 |
-			      CFG_PDWN32x32 | CFG_PDWN64x66;
+		sram_para1 = CFG_PDWN256x32 | CFG_PDWN256x24 | CFG_PDWN256x8 |
+			     CFG_PDWN32x32 | CFG_PDWN64x66;
 		dma_ctrl0_mask = CFG_GRA_ENA;
 	} else {
+		/* Power down the Y/U/V FIFOs */
+		sram_para1 = CFG_PDWN16x66 | CFG_PDWN32x66;
 		dma_ctrl0_mask = CFG_DMA_ENA;
 	}
 

From 9c898c495490b129bd4445630e3c6641e8389fc8 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Dec 2017 12:16:22 +0000
Subject: [PATCH 026/305] drm/armada: fix UV swap code

The UV swap code was not always programming things correctly when
the source origin box has been offset.  Fix this.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/gpu/drm/armada/armada_crtc.h    |  2 ++
 drivers/gpu/drm/armada/armada_overlay.c | 38 ++++++++++++-------------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/armada/armada_crtc.h b/drivers/gpu/drm/armada/armada_crtc.h
index bab11f483575..bfd3514fbe9b 100644
--- a/drivers/gpu/drm/armada/armada_crtc.h
+++ b/drivers/gpu/drm/armada/armada_crtc.h
@@ -42,6 +42,8 @@ struct armada_plane_work {
 };
 
 struct armada_plane_state {
+	u16 src_x;
+	u16 src_y;
 	u32 src_hw;
 	u32 dst_hw;
 	u32 dst_yx;
diff --git a/drivers/gpu/drm/armada/armada_overlay.c b/drivers/gpu/drm/armada/armada_overlay.c
index b411b608821a..aba947696178 100644
--- a/drivers/gpu/drm/armada/armada_overlay.c
+++ b/drivers/gpu/drm/armada/armada_overlay.c
@@ -99,6 +99,7 @@ armada_ovl_plane_update(struct drm_plane *plane, struct drm_crtc *crtc,
 {
 	struct armada_ovl_plane *dplane = drm_to_armada_ovl_plane(plane);
 	struct armada_crtc *dcrtc = drm_to_armada_crtc(crtc);
+	const struct drm_format_info *format;
 	struct drm_rect src = {
 		.x1 = src_x,
 		.y1 = src_y,
@@ -117,7 +118,7 @@ armada_ovl_plane_update(struct drm_plane *plane, struct drm_crtc *crtc,
 	};
 	uint32_t val, ctrl0;
 	unsigned idx = 0;
-	bool visible;
+	bool visible, fb_changed;
 	int ret;
 
 	trace_armada_ovl_plane_update(plane, crtc, fb,
@@ -138,6 +139,18 @@ armada_ovl_plane_update(struct drm_plane *plane, struct drm_crtc *crtc,
 	if (!visible)
 		ctrl0 &= ~CFG_DMA_ENA;
 
+	/*
+	 * Shifting a YUV packed format image by one pixel causes the U/V
+	 * planes to swap.  Compensate for it by also toggling the UV swap.
+	 */
+	format = fb->format;
+	if (format->num_planes == 1 && src.x1 >> 16 & (format->hsub - 1))
+		ctrl0 ^= CFG_DMA_MOD(CFG_SWAPUV);
+
+	fb_changed = plane->fb != fb ||
+		     dplane->base.state.src_x != src.x1 >> 16 ||
+	             dplane->base.state.src_y != src.y1 >> 16;
+
 	if (!dcrtc->plane) {
 		dcrtc->plane = plane;
 		armada_ovl_update_attr(&dplane->prop, dcrtc);
@@ -145,7 +158,7 @@ armada_ovl_plane_update(struct drm_plane *plane, struct drm_crtc *crtc,
 
 	/* FIXME: overlay on an interlaced display */
 	/* Just updating the position/size? */
-	if (plane->fb == fb && dplane->base.state.ctrl0 == ctrl0) {
+	if (!fb_changed && dplane->base.state.ctrl0 == ctrl0) {
 		val = (drm_rect_height(&src) & 0xffff0000) |
 		      drm_rect_width(&src) >> 16;
 		dplane->base.state.src_hw = val;
@@ -169,9 +182,8 @@ armada_ovl_plane_update(struct drm_plane *plane, struct drm_crtc *crtc,
 	if (armada_drm_plane_work_wait(&dplane->base, HZ / 25) == 0)
 		armada_drm_plane_work_cancel(dcrtc, &dplane->base);
 
-	if (plane->fb != fb) {
-		u32 addrs[3], pixel_format;
-		int num_planes, hsub;
+	if (fb_changed) {
+		u32 addrs[3];
 
 		/*
 		 * Take a reference on the new framebuffer - we want to
@@ -182,23 +194,11 @@ armada_ovl_plane_update(struct drm_plane *plane, struct drm_crtc *crtc,
 		if (plane->fb)
 			armada_ovl_retire_fb(dplane, plane->fb);
 
-		src_y = src.y1 >> 16;
-		src_x = src.x1 >> 16;
+		dplane->base.state.src_y = src_y = src.y1 >> 16;
+		dplane->base.state.src_x = src_x = src.x1 >> 16;
 
 		armada_drm_plane_calc_addrs(addrs, fb, src_x, src_y);
 
-		pixel_format = fb->format->format;
-		hsub = drm_format_horz_chroma_subsampling(pixel_format);
-		num_planes = fb->format->num_planes;
-
-		/*
-		 * Annoyingly, shifting a YUYV-format image by one pixel
-		 * causes the U/V planes to toggle.  Toggle the UV swap.
-		 * (Unfortunately, this causes momentary colour flickering.)
-		 */
-		if (src_x & (hsub - 1) && num_planes == 1)
-			ctrl0 ^= CFG_DMA_MOD(CFG_SWAPUV);
-
 		armada_reg_queue_set(dplane->vbl.regs, idx, addrs[0],
 				     LCD_SPU_DMA_START_ADDR_Y0);
 		armada_reg_queue_set(dplane->vbl.regs, idx, addrs[1],

From d6a48965db3d5f9b524ebfdd8c1fe3a4175d8e35 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Dec 2017 12:16:22 +0000
Subject: [PATCH 027/305] drm/armada: improve efficiency of
 armada_drm_plane_calc_addrs()

Lookup the drm_format_info structure once when computing all the
framebuffer plane addresses by using drm_format_info(), rather than
repetitive lookups via drm_format_plane_cpp().

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/gpu/drm/armada/armada_crtc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/armada/armada_crtc.c b/drivers/gpu/drm/armada/armada_crtc.c
index 400a133c0576..7f7b3e738679 100644
--- a/drivers/gpu/drm/armada/armada_crtc.c
+++ b/drivers/gpu/drm/armada/armada_crtc.c
@@ -168,8 +168,9 @@ static void armada_drm_crtc_update(struct armada_crtc *dcrtc)
 void armada_drm_plane_calc_addrs(u32 *addrs, struct drm_framebuffer *fb,
 	int x, int y)
 {
+	const struct drm_format_info *format = fb->format;
+	unsigned int num_planes = format->num_planes;
 	u32 addr = drm_fb_obj(fb)->dev_addr;
-	int num_planes = fb->format->num_planes;
 	int i;
 
 	if (num_planes > 3)
@@ -177,7 +178,7 @@ void armada_drm_plane_calc_addrs(u32 *addrs, struct drm_framebuffer *fb,
 
 	for (i = 0; i < num_planes; i++)
 		addrs[i] = addr + fb->offsets[i] + y * fb->pitches[i] +
-			     x * fb->format->cpp[i];
+			     x * format->cpp[i];
 	for (; i < 3; i++)
 		addrs[i] = 0;
 }

From de0ea9ad2f548dd9e555cac27cf7ade1db5b26ea Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Dec 2017 12:16:22 +0000
Subject: [PATCH 028/305] drm/armada: fix YUV planar format framebuffer offsets

We weren't correctly calculating the YUV planar offsets for subsampled
chroma planes - fix up the coordinates for planes 1 and 2.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/gpu/drm/armada/armada_crtc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/armada/armada_crtc.c b/drivers/gpu/drm/armada/armada_crtc.c
index 7f7b3e738679..a0f4d2a2a481 100644
--- a/drivers/gpu/drm/armada/armada_crtc.c
+++ b/drivers/gpu/drm/armada/armada_crtc.c
@@ -176,7 +176,13 @@ void armada_drm_plane_calc_addrs(u32 *addrs, struct drm_framebuffer *fb,
 	if (num_planes > 3)
 		num_planes = 3;
 
-	for (i = 0; i < num_planes; i++)
+	addrs[0] = addr + fb->offsets[0] + y * fb->pitches[0] +
+		   x * format->cpp[0];
+
+	y /= format->vsub;
+	x /= format->hsub;
+
+	for (i = 1; i < num_planes; i++)
 		addrs[i] = addr + fb->offsets[i] + y * fb->pitches[i] +
 			     x * format->cpp[i];
 	for (; i < 3; i++)

From e7fd37ba12170cc414be8b639dfc2c5f7172fac2 Mon Sep 17 00:00:00 2001
From: Ma Shimiao <mashimiao.fnst@cn.fujitsu.com>
Date: Tue, 12 Dec 2017 09:43:49 +0800
Subject: [PATCH 029/305] cgroup: avoid copying strings longer than the buffers

The cgroup root name and file name have maximum length limits, so we
should avoid copying names longer than those limits into their buffers.

tj: minor update to $SUBJ.

Signed-off-by: Ma Shimiao <mashimiao.fnst@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe147f24..18d71fbd3923 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
 			 cft->name);
 	else
-		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 	return buf;
 }
 
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 
 	root->flags = opts->flags;
 	if (opts->release_agent)
-		strcpy(root->release_agent_path, opts->release_agent);
+		strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
 	if (opts->name)
-		strcpy(root->name, opts->name);
+		strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
 	if (opts->cpuset_clone_children)
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }

From 964728f9f407eca0b417fdf8e784b7a76979490c Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Mon, 13 Nov 2017 11:12:58 +0100
Subject: [PATCH 030/305] USB: chipidea: msm: fix ulpi-node lookup

Fix child-node lookup during probe, which ended up searching the whole
device tree depth-first starting at the parent rather than just matching
on its children.

Note that the original premature free of the parent node has already
been fixed separately, but that fix was apparently never backported to
stable.

Fixes: 47654a162081 ("usb: chipidea: msm: Restore wrapper settings after reset")
Fixes: b74c43156c0c ("usb: chipidea: msm: ci_hdrc_msm_probe() missing of_node_get()")
Cc: stable <stable@vger.kernel.org>     # 4.10: b74c43156c0c
Cc: Stephen Boyd <stephen.boyd@linaro.org>
Cc: Frank Rowand <frank.rowand@sony.com>
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Peter Chen <peter.chen@nxp.com>
---
 drivers/usb/chipidea/ci_hdrc_msm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c
index 3593ce0ec641..880009987460 100644
--- a/drivers/usb/chipidea/ci_hdrc_msm.c
+++ b/drivers/usb/chipidea/ci_hdrc_msm.c
@@ -247,7 +247,7 @@ static int ci_hdrc_msm_probe(struct platform_device *pdev)
 	if (ret)
 		goto err_mux;
 
-	ulpi_node = of_find_node_by_name(of_node_get(pdev->dev.of_node), "ulpi");
+	ulpi_node = of_get_child_by_name(pdev->dev.of_node, "ulpi");
 	if (ulpi_node) {
 		phy_node = of_get_next_available_child(ulpi_node, NULL);
 		ci->hsic = of_device_is_compatible(phy_node, "qcom,usb-hsic-phy");

From a5f1005517534aeb1fac20180badfbf0896c183c Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Fri, 1 Dec 2017 18:47:32 +0100
Subject: [PATCH 031/305] s390/pci: handle insufficient resources during dma
 tlb flush

In a virtualized setup lazy flushing can lead to the hypervisor
running out of resources when lots of guest pages need to be
pinned. In this situation simply trigger a global flush to give
the hypervisor a chance to free some of these resources.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/pci/pci_dma.c  | 21 +++++++++++++++++++--
 arch/s390/pci/pci_insn.c |  3 +++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index f7aa5a77827e..2d15d84c20ed 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -181,6 +181,9 @@ out_unlock:
 static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,
 			   size_t size, int flags)
 {
+	unsigned long irqflags;
+	int ret;
+
 	/*
 	 * With zdev->tlb_refresh == 0, rpcit is not required to establish new
 	 * translations when previously invalid translation-table entries are
@@ -196,8 +199,22 @@ static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,
 			return 0;
 	}
 
-	return zpci_refresh_trans((u64) zdev->fh << 32, dma_addr,
-				  PAGE_ALIGN(size));
+	ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr,
+				 PAGE_ALIGN(size));
+	if (ret == -ENOMEM && !s390_iommu_strict) {
+		/* enable the hypervisor to free some resources */
+		if (zpci_refresh_global(zdev))
+			goto out;
+
+		spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags);
+		bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap,
+			      zdev->lazy_bitmap, zdev->iommu_pages);
+		bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages);
+		spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags);
+		ret = 0;
+	}
+out:
+	return ret;
 }
 
 static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c
index 19bcb3b45a70..f069929e8211 100644
--- a/arch/s390/pci/pci_insn.c
+++ b/arch/s390/pci/pci_insn.c
@@ -89,6 +89,9 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
 	if (cc)
 		zpci_err_insn(cc, status, addr, range);
 
+	if (cc == 1 && (status == 4 || status == 16))
+		return -ENOMEM;
+
 	return (cc) ? -EIO : 0;
 }
 

From 366d8216488319ed29308b977cd62b7964a779b7 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 13 Dec 2017 09:21:59 +0100
Subject: [PATCH 032/305] s390/sclp: disable FORTIFY_SOURCE for early sclp code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Michal Suchánek reported the following compile error with
FORTIFY_SOURCE enabled:

drivers/s390/char/sclp_early_core.o: In function `memcpy':
include/linux/string.h:340: undefined reference to `fortify_panic'

To fix this simply disable FORTIFY_SOURCE on the early sclp code as
well, which I forgot on the initial commit.

Fixes: 79962038dffa ("s390: add support for FORTIFY_SOURCE")
Reported-by: Michal Suchánek <msuchanek@suse.de>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/char/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile
index 05ac6ba15a53..614b44e70a28 100644
--- a/drivers/s390/char/Makefile
+++ b/drivers/s390/char/Makefile
@@ -17,6 +17,8 @@ CFLAGS_REMOVE_sclp_early_core.o	+= $(CC_FLAGS_MARCH)
 CFLAGS_sclp_early_core.o		+= -march=z900
 endif
 
+CFLAGS_sclp_early_core.o		+= -D__NO_FORTIFY
+
 obj-y += ctrlchar.o keyboard.o defkeymap.o sclp.o sclp_rw.o sclp_quiesce.o \
 	 sclp_cmd.o sclp_config.o sclp_cpi_sys.o sclp_ocf.o sclp_ctl.o \
 	 sclp_early.o sclp_early_core.o

From 08933099e6404f588f81c2050bfec7313e06eeaf Mon Sep 17 00:00:00 2001
From: Daniele Palmas <dnlplm@gmail.com>
Date: Thu, 14 Dec 2017 16:54:45 +0100
Subject: [PATCH 033/305] USB: serial: option: add support for Telit ME910 PID
 0x1101

This patch adds support for PID 0x1101 of Telit ME910.

Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/option.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 3b3513874cfd..b02fb576b856 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -280,6 +280,7 @@ static void option_instat_callback(struct urb *urb);
 #define TELIT_PRODUCT_LE922_USBCFG3		0x1043
 #define TELIT_PRODUCT_LE922_USBCFG5		0x1045
 #define TELIT_PRODUCT_ME910			0x1100
+#define TELIT_PRODUCT_ME910_DUAL_MODEM		0x1101
 #define TELIT_PRODUCT_LE920			0x1200
 #define TELIT_PRODUCT_LE910			0x1201
 #define TELIT_PRODUCT_LE910_USBCFG4		0x1206
@@ -645,6 +646,11 @@ static const struct option_blacklist_info telit_me910_blacklist = {
 	.reserved = BIT(1) | BIT(3),
 };
 
+static const struct option_blacklist_info telit_me910_dual_modem_blacklist = {
+	.sendsetup = BIT(0),
+	.reserved = BIT(3),
+};
+
 static const struct option_blacklist_info telit_le910_blacklist = {
 	.sendsetup = BIT(0),
 	.reserved = BIT(1) | BIT(2),
@@ -1244,6 +1250,8 @@ static const struct usb_device_id option_ids[] = {
 		.driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 },
 	{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910),
 		.driver_info = (kernel_ulong_t)&telit_me910_blacklist },
+	{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM),
+		.driver_info = (kernel_ulong_t)&telit_me910_dual_modem_blacklist },
 	{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910),
 		.driver_info = (kernel_ulong_t)&telit_le910_blacklist },
 	{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4),

From 92a18a657fb2e2ffbfa0659af32cc18fd2346516 Mon Sep 17 00:00:00 2001
From: Reinhard Speyerer <rspmn@arcor.de>
Date: Fri, 15 Dec 2017 00:39:27 +0100
Subject: [PATCH 034/305] USB: serial: qcserial: add Sierra Wireless EM7565
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sierra Wireless EM7565 devices use the QCSERIAL_SWI layout for their
serial ports

T:  Bus=01 Lev=03 Prnt=29 Port=01 Cnt=02 Dev#= 31 Spd=480  MxCh= 0
D:  Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs=  1
P:  Vendor=1199 ProdID=9091 Rev= 0.06
S:  Manufacturer=Sierra Wireless, Incorporated
S:  Product=Sierra Wireless EM7565 Qualcomm Snapdragon X16 LTE-A
S:  SerialNumber=xxxxxxxx
C:* #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=qcserial
E:  Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=qcserial
E:  Ad=83(I) Atr=03(Int.) MxPS=  10 Ivl=32ms
E:  Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=qcserial
E:  Ad=85(I) Atr=03(Int.) MxPS=  10 Ivl=32ms
E:  Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 8 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan
E:  Ad=86(I) Atr=03(Int.) MxPS=   8 Ivl=32ms
E:  Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms

but need sendsetup = true for the NMEA port to make it work properly.

Simplify the patch compared to v1 as suggested by Bjørn Mork by taking
advantage of the fact that existing devices work with sendsetup = true
too.

Use sendsetup = true for the NMEA interface of QCSERIAL_SWI and add
DEVICE_SWI entries for the EM7565 PID 0x9091 and the EM7565 QDL PID
0x9090.

Tests with several MC73xx/MC74xx/MC77xx devices have been performed in
order to verify backward compatibility.

Signed-off-by: Reinhard Speyerer <rspmn@arcor.de>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/qcserial.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c
index e3892541a489..613f91add03d 100644
--- a/drivers/usb/serial/qcserial.c
+++ b/drivers/usb/serial/qcserial.c
@@ -162,6 +162,8 @@ static const struct usb_device_id id_table[] = {
 	{DEVICE_SWI(0x1199, 0x9079)},	/* Sierra Wireless EM74xx */
 	{DEVICE_SWI(0x1199, 0x907a)},	/* Sierra Wireless EM74xx QDL */
 	{DEVICE_SWI(0x1199, 0x907b)},	/* Sierra Wireless EM74xx */
+	{DEVICE_SWI(0x1199, 0x9090)},	/* Sierra Wireless EM7565 QDL */
+	{DEVICE_SWI(0x1199, 0x9091)},	/* Sierra Wireless EM7565 */
 	{DEVICE_SWI(0x413c, 0x81a2)},	/* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */
 	{DEVICE_SWI(0x413c, 0x81a3)},	/* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */
 	{DEVICE_SWI(0x413c, 0x81a4)},	/* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */
@@ -342,6 +344,7 @@ static int qcprobe(struct usb_serial *serial, const struct usb_device_id *id)
 			break;
 		case 2:
 			dev_dbg(dev, "NMEA GPS interface found\n");
+			sendsetup = true;
 			break;
 		case 3:
 			dev_dbg(dev, "Modem port found\n");

From 046046737bd35bed047460f080ea47e186be731e Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Wed, 15 Nov 2017 10:43:16 +0100
Subject: [PATCH 035/305] phy: tegra: fix device-tree node lookups

Fix child-node lookups during probe, which ended up searching the whole
device tree depth-first starting at the parents rather than just
matching on their children.

To make things worse, some parent nodes could end up being
prematurely freed (by tegra_xusb_pad_register()) as
of_find_node_by_name() drops a reference to its first argument.

Fixes: 53d2a715c240 ("phy: Add Tegra XUSB pad controller support")
Cc: stable <stable@vger.kernel.org>     # 4.7
Cc: Thierry Reding <treding@nvidia.com>
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 drivers/phy/tegra/xusb.c | 58 ++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
index 4307bf0013e1..63e916d4d069 100644
--- a/drivers/phy/tegra/xusb.c
+++ b/drivers/phy/tegra/xusb.c
@@ -75,14 +75,14 @@ MODULE_DEVICE_TABLE(of, tegra_xusb_padctl_of_match);
 static struct device_node *
 tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
 {
-	/*
-	 * of_find_node_by_name() drops a reference, so make sure to grab one.
-	 */
-	struct device_node *np = of_node_get(padctl->dev->of_node);
+	struct device_node *pads, *np;
 
-	np = of_find_node_by_name(np, "pads");
-	if (np)
-		np = of_find_node_by_name(np, name);
+	pads = of_get_child_by_name(padctl->dev->of_node, "pads");
+	if (!pads)
+		return NULL;
+
+	np = of_get_child_by_name(pads, name);
+	of_node_put(pads);
 
 	return np;
 }
@@ -90,16 +90,16 @@ tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
 static struct device_node *
 tegra_xusb_pad_find_phy_node(struct tegra_xusb_pad *pad, unsigned int index)
 {
-	/*
-	 * of_find_node_by_name() drops a reference, so make sure to grab one.
-	 */
-	struct device_node *np = of_node_get(pad->dev.of_node);
+	struct device_node *np, *lanes;
 
-	np = of_find_node_by_name(np, "lanes");
-	if (!np)
+	lanes = of_get_child_by_name(pad->dev.of_node, "lanes");
+	if (!lanes)
 		return NULL;
 
-	return of_find_node_by_name(np, pad->soc->lanes[index].name);
+	np = of_get_child_by_name(lanes, pad->soc->lanes[index].name);
+	of_node_put(lanes);
+
+	return np;
 }
 
 static int
@@ -195,7 +195,7 @@ int tegra_xusb_pad_register(struct tegra_xusb_pad *pad,
 	unsigned int i;
 	int err;
 
-	children = of_find_node_by_name(pad->dev.of_node, "lanes");
+	children = of_get_child_by_name(pad->dev.of_node, "lanes");
 	if (!children)
 		return -ENODEV;
 
@@ -444,21 +444,21 @@ static struct device_node *
 tegra_xusb_find_port_node(struct tegra_xusb_padctl *padctl, const char *type,
 			  unsigned int index)
 {
-	/*
-	 * of_find_node_by_name() drops a reference, so make sure to grab one.
-	 */
-	struct device_node *np = of_node_get(padctl->dev->of_node);
+	struct device_node *ports, *np;
+	char *name;
 
-	np = of_find_node_by_name(np, "ports");
-	if (np) {
-		char *name;
+	ports = of_get_child_by_name(padctl->dev->of_node, "ports");
+	if (!ports)
+		return NULL;
 
-		name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
-		if (!name)
-			return ERR_PTR(-ENOMEM);
-		np = of_find_node_by_name(np, name);
-		kfree(name);
+	name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
+	if (!name) {
+		of_node_put(ports);
+		return ERR_PTR(-ENOMEM);
 	}
+	np = of_get_child_by_name(ports, name);
+	kfree(name);
+	of_node_put(ports);
 
 	return np;
 }
@@ -847,7 +847,7 @@ static void tegra_xusb_remove_ports(struct tegra_xusb_padctl *padctl)
 
 static int tegra_xusb_padctl_probe(struct platform_device *pdev)
 {
-	struct device_node *np = of_node_get(pdev->dev.of_node);
+	struct device_node *np = pdev->dev.of_node;
 	const struct tegra_xusb_padctl_soc *soc;
 	struct tegra_xusb_padctl *padctl;
 	const struct of_device_id *match;
@@ -855,7 +855,7 @@ static int tegra_xusb_padctl_probe(struct platform_device *pdev)
 	int err;
 
 	/* for backwards compatibility with old device trees */
-	np = of_find_node_by_name(np, "pads");
+	np = of_get_child_by_name(np, "pads");
 	if (!np) {
 		dev_warn(&pdev->dev, "deprecated DT, using legacy driver\n");
 		return tegra_xusb_padctl_legacy_probe(pdev);

From e796cc6a3a9186c92092e2f5929cf8f65b56cf01 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Fri, 17 Nov 2017 16:55:35 +0530
Subject: [PATCH 036/305] phy: cpcap-usb: Fix platform_get_irq_byname's error
 checking.

The platform_get_irq_byname() function returns a negative value if an error
occurs and zero or a positive number on success, so checking its return
value against zero is not correct.

Fixes: 6d6ce40f63af ("phy: cpcap-usb: Add CPCAP PMIC USB support")
Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Reviewed-by: Sebastian Reichel <sebastian.reichel@collabora.co.uk>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 drivers/phy/motorola/phy-cpcap-usb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/phy/motorola/phy-cpcap-usb.c b/drivers/phy/motorola/phy-cpcap-usb.c
index accaaaccb662..6601ad0dfb3a 100644
--- a/drivers/phy/motorola/phy-cpcap-usb.c
+++ b/drivers/phy/motorola/phy-cpcap-usb.c
@@ -310,7 +310,7 @@ static int cpcap_usb_init_irq(struct platform_device *pdev,
 	int irq, error;
 
 	irq = platform_get_irq_byname(pdev, name);
-	if (!irq)
+	if (irq < 0)
 		return -ENODEV;
 
 	error = devm_request_threaded_irq(ddata->dev, irq, NULL,

From 3cb0ab6e008f2a9ffe2d1be4246984003caed7e2 Mon Sep 17 00:00:00 2001
From: Chris Zhong <zyw@rock-chips.com>
Date: Thu, 8 Sep 2016 10:38:11 -0700
Subject: [PATCH 037/305] phy: rockchip-typec: add pm_runtime_disable in err
 case

Add pm_runtime_disable() in the error paths so that calls to
pm_runtime_enable() and pm_runtime_disable() stay balanced.

Signed-off-by: Chris Zhong <zyw@rock-chips.com>
Reviewed-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 drivers/phy/rockchip/phy-rockchip-typec.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/phy/rockchip/phy-rockchip-typec.c b/drivers/phy/rockchip/phy-rockchip-typec.c
index ee85fa0ca4b0..7492c8978217 100644
--- a/drivers/phy/rockchip/phy-rockchip-typec.c
+++ b/drivers/phy/rockchip/phy-rockchip-typec.c
@@ -1137,6 +1137,7 @@ static int rockchip_typec_phy_probe(struct platform_device *pdev)
 		if (IS_ERR(phy)) {
 			dev_err(dev, "failed to create phy: %s\n",
 				child_np->name);
+			pm_runtime_disable(dev);
 			return PTR_ERR(phy);
 		}
 
@@ -1146,6 +1147,7 @@ static int rockchip_typec_phy_probe(struct platform_device *pdev)
 	phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate);
 	if (IS_ERR(phy_provider)) {
 		dev_err(dev, "Failed to register phy provider\n");
+		pm_runtime_disable(dev);
 		return PTR_ERR(phy_provider);
 	}
 

From 2b88212c4cc67ff33dec5bb4d690044b97a5f979 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 2 Nov 2017 12:56:36 +0100
Subject: [PATCH 038/305] phy: rcar-gen3-usb2: select USB_COMMON

When USB is disabled, we get a link error for this driver
because of the added OTG support:

drivers/phy/renesas/phy-rcar-gen3-usb2.o: In function `rcar_gen3_phy_usb2_probe':
phy-rcar-gen3-usb2.c:(.text+0x250): undefined reference to `of_usb_get_dr_mode_by_phy'

Other phy drivers select USB_COMMON for this, so let's do the same
here.

Fixes: 7e0540f41332 ("phy: rcar-gen3-usb2: check dr_mode for otg mode")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 drivers/phy/renesas/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/phy/renesas/Kconfig b/drivers/phy/renesas/Kconfig
index cb09245e9b4c..c845facacb06 100644
--- a/drivers/phy/renesas/Kconfig
+++ b/drivers/phy/renesas/Kconfig
@@ -12,7 +12,9 @@ config PHY_RCAR_GEN3_USB2
 	tristate "Renesas R-Car generation 3 USB 2.0 PHY driver"
 	depends on ARCH_RENESAS
 	depends on EXTCON
+	depends on USB_SUPPORT
 	select GENERIC_PHY
+	select USB_COMMON
 	help
 	  Support for USB 2.0 PHY found on Renesas R-Car generation 3 SoCs.
 

From 50034ed49645463a16327cad05694e201e6b4126 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 15 Dec 2017 05:09:47 -0800
Subject: [PATCH 039/305] cgroup: use strlcpy() instead of strscpy() to avoid
 spurious warning

As long as cft->name is guaranteed to be NUL-terminated, using strlcpy() would
work just as well and avoid that warning, so the change below could be folded
into that commit.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 18d71fbd3923..f4c2f8cb5748 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
 			 cft->name);
 	else
-		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+		strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 	return buf;
 }
 
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 
 	root->flags = opts->flags;
 	if (opts->release_agent)
-		strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
+		strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX);
 	if (opts->name)
-		strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
+		strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
 	if (opts->cpuset_clone_children)
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }

From 9f37e797547cca9d14fe1f0f43f5c89b261ff0b0 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 15 Dec 2017 14:16:04 +0100
Subject: [PATCH 040/305] s390: fix preemption race in disable_sacf_uaccess

With CONFIG_PREEMPT=y there is a possible race in disable_sacf_uaccess.

The new set_fs value needs to be stored in the task structure first,
the control register update needs to be second. Otherwise a preemptive
schedule may interrupt the code right after the control register update
has been done and the next time the task is scheduled we get an incorrect
value in the control register due to the old set_fs setting.

Fixes: 0aaba41b58 ("s390: remove all code using the access register mode")
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/lib/uaccess.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index cae5a1e16cbd..c4f8039a35e8 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -89,11 +89,11 @@ EXPORT_SYMBOL(enable_sacf_uaccess);
 
 void disable_sacf_uaccess(mm_segment_t old_fs)
 {
+	current->thread.mm_segment = old_fs;
 	if (old_fs == USER_DS && test_facility(27)) {
 		__ctl_load(S390_lowcore.user_asce, 1, 1);
 		clear_cpu_flag(CIF_ASCE_PRIMARY);
 	}
-	current->thread.mm_segment = old_fs;
 }
 EXPORT_SYMBOL(disable_sacf_uaccess);
 

From c739f930be1dd5fd949030e3475a884fe06dae9b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Dec 2017 07:56:36 -0800
Subject: [PATCH 041/305] x86/espfix/64: Fix espfix double-fault handling on
 5-level systems

Using PGDIR_SHIFT to identify espfix64 addresses on 5-level systems
was wrong, and it resulted in panics due to unhandled double faults.
Use P4D_SHIFT instead, which is correct on 4-level and 5-level
machines.

This fixes a panic when running x86 selftests on 5-level machines.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: 1d33b219563f ("x86/espfix: Add support for 5-level paging")
Link: http://lkml.kernel.org/r/24c898b4f44fdf8c22d93703850fb384ef87cfdc.1513035461.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b7b0f74a2150..c751518936ac 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -355,7 +355,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	 *
 	 * No need for ist_enter here because we don't use RCU.
 	 */
-	if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
+	if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
 		regs->cs == __KERNEL_CS &&
 		regs->ip == (unsigned long)native_irq_return_iret)
 	{

From f57ab9a01a36ef3454333251cc57e3a9948b17bf Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Fri, 17 Nov 2017 11:56:41 +0000
Subject: [PATCH 042/305] drivers: base: cacheinfo: fix cache type for
 non-architected system cache

Commit dfea747d2aba ("drivers: base: cacheinfo: support DT overrides for
cache properties") doesn't initialise the cache type if it's present
only in DT and the architecture is not aware of it. Such caches are unified
system-level caches, which are generally transparent.

This patch checks whether the cache type is set to NOCACHE while the DT node
indicates that it is a unified cache, and sets the cache type accordingly.

Fixes: dfea747d2aba ("drivers: base: cacheinfo: support DT overrides for cache properties")
Reported-and-tested-by: Tan Xiaojun <tanxiaojun@huawei.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/cacheinfo.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index eb3af2739537..07532d83be0b 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -186,6 +186,11 @@ static void cache_associativity(struct cacheinfo *this_leaf)
 		this_leaf->ways_of_associativity = (size / nr_sets) / line_size;
 }
 
+static bool cache_node_is_unified(struct cacheinfo *this_leaf)
+{
+	return of_property_read_bool(this_leaf->of_node, "cache-unified");
+}
+
 static void cache_of_override_properties(unsigned int cpu)
 {
 	int index;
@@ -194,6 +199,14 @@ static void cache_of_override_properties(unsigned int cpu)
 
 	for (index = 0; index < cache_leaves(cpu); index++) {
 		this_leaf = this_cpu_ci->info_list + index;
+		/*
+		 * init_cache_level must setup the cache level correctly
+		 * overriding the architecturally specified levels, so
+		 * if type is NONE at this stage, it should be unified
+		 */
+		if (this_leaf->type == CACHE_TYPE_NOCACHE &&
+		    cache_node_is_unified(this_leaf))
+			this_leaf->type = CACHE_TYPE_UNIFIED;
 		cache_size(this_leaf);
 		cache_get_line_size(this_leaf);
 		cache_nr_sets(this_leaf);

From 5f0e3fe6b1504d4e6530294ec87c473aa6d2d02f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Tue, 14 Nov 2017 09:10:11 -0500
Subject: [PATCH 043/305] x86/build: Make isoimage work on Debian

Debian does not ship a 'mkisofs' symlink to genisoimage.  All modern
distros ship genisoimage, so just use that directly.  That requires
renaming the 'genisoimage' function.  Also neaten up the 'for' loop
while I'm in here.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Changbin Du <changbin.du@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/genimage.sh | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
index c9e8499fbfe7..6a10d52a4145 100644
--- a/arch/x86/boot/genimage.sh
+++ b/arch/x86/boot/genimage.sh
@@ -80,39 +80,43 @@ genfdimage288() {
 	mcopy $FBZIMAGE w:linux
 }
 
-genisoimage() {
+geniso() {
 	tmp_dir=`dirname $FIMAGE`/isoimage
 	rm -rf $tmp_dir
 	mkdir $tmp_dir
-	for i in lib lib64 share end ; do
+	for i in lib lib64 share ; do
 		for j in syslinux ISOLINUX ; do
 			if [ -f /usr/$i/$j/isolinux.bin ] ; then
 				isolinux=/usr/$i/$j/isolinux.bin
-				cp $isolinux $tmp_dir
 			fi
 		done
 		for j in syslinux syslinux/modules/bios ; do
 			if [ -f /usr/$i/$j/ldlinux.c32 ]; then
 				ldlinux=/usr/$i/$j/ldlinux.c32
-				cp $ldlinux $tmp_dir
 			fi
 		done
 		if [ -n "$isolinux" -a -n "$ldlinux" ] ; then
 			break
 		fi
-		if [ $i = end -a -z "$isolinux" ] ; then
-			echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
-			exit 1
-		fi
 	done
+	if [ -z "$isolinux" ] ; then
+		echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
+		exit 1
+	fi
+	if [ -z "$ldlinux" ] ; then
+		echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.'
+		exit 1
+	fi
+	cp $isolinux $tmp_dir
+	cp $ldlinux $tmp_dir
 	cp $FBZIMAGE $tmp_dir/linux
 	echo "$KCMDLINE" > $tmp_dir/isolinux.cfg
 	if [ -f "$FDINITRD" ] ; then
 		cp "$FDINITRD" $tmp_dir/initrd.img
 	fi
-	mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \
-		-c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \
-		$tmp_dir
+	genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \
+		-b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \
+		-boot-info-table $tmp_dir
 	isohybrid $FIMAGE 2>/dev/null || true
 	rm -rf $tmp_dir
 }
@@ -121,6 +125,6 @@ case $1 in
 	bzdisk)     genbzdisk;;
 	fdimage144) genfdimage144;;
 	fdimage288) genfdimage288;;
-	isoimage)   genisoimage;;
+	isoimage)   geniso;;
 	*)          echo 'Unknown image format'; exit 1;
 esac

From cce1fea50e3be6b78fc677e8cf20cd0ca4c851b0 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 1 Dec 2017 15:08:03 +0300
Subject: [PATCH 044/305] thunderbolt: Make pathname to force_power shorter

WMI is a bus inside the kernel, so we can access the GUID via
/sys/bus/wmi instead of going through the /sys/devices path.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Mario Limonciello <mario.limonciello@dell.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/admin-guide/thunderbolt.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst
index de50a8561774..9b55952039a6 100644
--- a/Documentation/admin-guide/thunderbolt.rst
+++ b/Documentation/admin-guide/thunderbolt.rst
@@ -230,7 +230,7 @@ If supported by your machine this will be exposed by the WMI bus with
 a sysfs attribute called "force_power".
 
 For example the intel-wmi-thunderbolt driver exposes this attribute in:
-  /sys/devices/platform/PNP0C14:00/wmi_bus/wmi_bus-PNP0C14:00/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
+  /sys/bus/wmi/devices/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
 
   To force the power to on, write 1 to this attribute file.
   To disable force power, write 0 to this attribute file.

From 78dfa29c84bab548910490cf7508c53ad99d1d9e Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Fri, 1 Dec 2017 15:08:04 +0300
Subject: [PATCH 045/305] MAINTAINERS: Add thunderbolt.rst to the Thunderbolt
 driver entry

Make sure Thunderbolt maintainers get to see patches that touch
documentation of the Thunderbolt driver as well.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 82ad0eabce4f..5da966e19e8a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13492,6 +13492,7 @@ M:	Mika Westerberg <mika.westerberg@linux.intel.com>
 M:	Yehezkel Bernat <yehezkel.bernat@intel.com>
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
 S:	Maintained
+F:	Documentation/admin-guide/thunderbolt.rst
 F:	drivers/thunderbolt/
 F:	include/linux/thunderbolt.h
 

From 74657181e7c449351d1ad28cf43941bc333e1bd6 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Fri, 1 Dec 2017 15:08:05 +0300
Subject: [PATCH 046/305] thunderbolt: Mask ring interrupt properly when
 polling starts

When a ring enters polling mode we are expected to mask the ring interrupt
before the callback is called. However, the current code actually
unmasks it, probably because of a copy-paste mistake.

Mask the interrupt properly from now on.

Fixes: 4ffe722eefcb ("thunderbolt: Add polling mode for rings")
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/thunderbolt/nhi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index 419a7a90bce0..f45bcbc63738 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -339,7 +339,7 @@ static void __ring_interrupt(struct tb_ring *ring)
 		return;
 
 	if (ring->start_poll) {
-		__ring_interrupt_mask(ring, false);
+		__ring_interrupt_mask(ring, true);
 		ring->start_poll(ring->poll_data);
 	} else {
 		schedule_work(&ring->work);

From 9d5f38ba6c82359b7cec31fb27fb78ecc02f3946 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Fri, 15 Dec 2017 10:20:12 -0600
Subject: [PATCH 047/305] x86/mm: Unbreak modules that use the DMA API

Commit d8aa7eea78a1 ("x86/mm: Add Secure Encrypted Virtualization (SEV)
support") changed sme_active() from an inline function that referenced
sme_me_mask to a non-inlined function in order to make the sev_enabled
variable a static variable.  This function was marked EXPORT_SYMBOL_GPL
because at the time the patch was submitted, sme_me_mask was marked
EXPORT_SYMBOL_GPL.

Commit 87df26175e67 ("x86/mm: Unbreak modules that rely on external
PAGE_KERNEL availability") changed sme_me_mask variable from
EXPORT_SYMBOL_GPL to EXPORT_SYMBOL, allowing external modules the ability
to build with CONFIG_AMD_MEM_ENCRYPT=y.  Now, however, with sev_active()
no longer an inline function and marked as EXPORT_SYMBOL_GPL, external
modules that use the DMA API are once again broken in 4.15. Since the DMA
API is meant to be used by external modules, this needs to be changed.

Change the sme_active() and sev_active() functions from EXPORT_SYMBOL_GPL
to EXPORT_SYMBOL.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Link: https://lkml.kernel.org/r/20171215162011.14125.7113.stgit@tlendack-t1.amdoffice.net
---
 arch/x86/mm/mem_encrypt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index d9a9e9fc75dd..391b13402e40 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -405,13 +405,13 @@ bool sme_active(void)
 {
 	return sme_me_mask && !sev_enabled;
 }
-EXPORT_SYMBOL_GPL(sme_active);
+EXPORT_SYMBOL(sme_active);
 
 bool sev_active(void)
 {
 	return sme_me_mask && sev_enabled;
 }
-EXPORT_SYMBOL_GPL(sev_active);
+EXPORT_SYMBOL(sev_active);
 
 static const struct dma_map_ops sev_dma_ops = {
 	.alloc                  = sev_alloc,

From bf29cb238dc0656e6564b6a94bb82e11d2129437 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 14 Dec 2017 19:18:25 +0100
Subject: [PATCH 048/305] sched/isolation: Make CONFIG_NO_HZ_FULL select
 CONFIG_CPU_ISOLATION

CONFIG_NO_HZ_FULL doesn't make sense without CONFIG_CPU_ISOLATION. In
fact enabling the first without the second is a regression as nohz_full=
boot parameter gets silently ignored.

Besides, this unnatural combination hangs the RCU gp kthread when running
rcutorture for reasons that are not yet fully understood:

	rcu_preempt kthread starved for 9974 jiffies! g4294967208
	+c4294967207 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x402 ->cpu=0
	rcu_preempt     I 7464     8      2 0x80000000
	Call Trace:
		__schedule+0x493/0x620
		schedule+0x24/0x40
		schedule_timeout+0x330/0x3b0
		? preempt_count_sub+0xea/0x140
		? collect_expired_timers+0xb0/0xb0
		rcu_gp_kthread+0x6bf/0xef0

This commit therefore makes NO_HZ_FULL select CPU_ISOLATION, which
prevents all these bad behaviours.

Reported-by: kernel test robot <xiaolong.ye@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Fixes: 5c4991e24c69 ("sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL")
Link: http://lkml.kernel.org/r/1513275507-29200-2-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e776fc8cc1df..f6b5f19223d6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -95,6 +95,7 @@ config NO_HZ_FULL
 	select RCU_NOCB_CPU
 	select VIRT_CPU_ACCOUNTING_GEN
 	select IRQ_WORK
+	select CPU_ISOLATION
 	help
 	 Adaptively try to shutdown the tick whenever possible, even when
 	 the CPU is running tasks. Typically this requires running a single

From 2c43838c99d9d23f17eb2bdadafcb2879cca6995 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 14 Dec 2017 19:18:26 +0100
Subject: [PATCH 049/305] sched/isolation: Enable CONFIG_CPU_ISOLATION=y by
 default

The "isolcpus=" boot parameter support was always built-in before we
moved the related code under CONFIG_CPU_ISOLATION. Having it disabled by
default is very confusing for people accustomed to use this parameter.

So enable it by default to keep the previous behaviour, but keep it
optional for those who want to tinify their kernels.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: kernel test robot <xiaolong.ye@intel.com>
Link: http://lkml.kernel.org/r/1513275507-29200-3-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 init/Kconfig | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/init/Kconfig b/init/Kconfig
index 2934249fba46..690a381adee0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -461,10 +461,14 @@ endmenu # "CPU/Task time and stats accounting"
 
 config CPU_ISOLATION
 	bool "CPU isolation"
+	default y
 	help
 	  Make sure that CPUs running critical tasks are not disturbed by
 	  any source of "noise" such as unbound workqueues, timers, kthreads...
-	  Unbound jobs get offloaded to housekeeping CPUs.
+	  Unbound jobs get offloaded to housekeeping CPUs. This is driven by
+	  the "isolcpus=" boot parameter.
+
+	  Say Y if unsure.
 
 source "kernel/rcu/Kconfig"
 

From d94d105329e4a8a874853b5bd854b6587c41adda Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 14 Dec 2017 19:18:27 +0100
Subject: [PATCH 050/305] sched/isolation: Document boot parameters dependency
 on CONFIG_CPU_ISOLATION=y

The "isolcpus=" and "nohz_full=" boot parameters depend on CPU Isolation
support. Let's document that.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: kernel test robot <xiaolong.ye@intel.com>
Link: http://lkml.kernel.org/r/1513275507-29200-4-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.rst | 1 +
 Documentation/admin-guide/kernel-parameters.txt | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index b2598cc9834c..7242cbda15dd 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -109,6 +109,7 @@ parameter is applicable::
 	IPV6	IPv6 support is enabled.
 	ISAPNP	ISA PnP code is enabled.
 	ISDN	Appropriate ISDN support is enabled.
+	ISOL	CPU Isolation is enabled.
 	JOY	Appropriate joystick support is enabled.
 	KGDB	Kernel debugger support is enabled.
 	KVM	Kernel Virtual Machine support is enabled.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6571fbfdb2a1..168310707ec2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1737,7 +1737,7 @@
 	isapnp=		[ISAPNP]
 			Format: <RDP>,<reset>,<pci_scan>,<verbosity>
 
-	isolcpus=	[KNL,SMP] Isolate a given set of CPUs from disturbance.
+	isolcpus=	[KNL,SMP,ISOL] Isolate a given set of CPUs from disturbance.
 			[Deprecated - use cpusets instead]
 			Format: [flag-list,]<cpu-list>
 
@@ -2662,7 +2662,7 @@
 			Valid arguments: on, off
 			Default: on
 
-	nohz_full=	[KNL,BOOT]
+	nohz_full=	[KNL,BOOT,SMP,ISOL]
 			The argument is a cpu list, as described above.
 			In kernels built with CONFIG_NO_HZ_FULL=y, set
 			the specified list of CPUs whose tick will be stopped

From 869b5567e12f63ea7407f81728ca87f8c0abbfdb Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Tue, 14 Nov 2017 06:53:32 -0700
Subject: [PATCH 051/305] vmbus: unregister device_obj->channels_kset

Without the patch, a device can't be thoroughly destroyed, because
vmbus_device_register() -> kset_create_and_add() still holds a reference
to the hv_device's device.kobj.

Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Fixes: c2e5df616e1a ("vmbus: add per-channel sysfs info")
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/vmbus_drv.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 76ed9a216f10..610223f0e945 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1378,6 +1378,8 @@ void vmbus_device_unregister(struct hv_device *device_obj)
 	pr_debug("child device %s unregistered\n",
 		dev_name(&device_obj->device));
 
+	kset_unregister(device_obj->channels_kset);
+
 	/*
 	 * Kick off the process of unregistering the device.
 	 * This will call vmbus_remove() and eventually vmbus_device_release()

From 7f3dc0088b98533f17128058fac73cd8b2752ef1 Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos@android.com>
Date: Mon, 27 Nov 2017 09:32:33 -0800
Subject: [PATCH 052/305] binder: fix proc->files use-after-free

proc->files cleanup is initiated by binder_vma_close. Therefore
a reference on the binder_proc is not enough to prevent the
files_struct from being released while the binder_proc still has
a reference. This can lead to an attempt to dereference the
stale pointer obtained from proc->files prior to proc->files
cleanup. This has been seen once in task_get_unused_fd_flags()
when __alloc_fd() is called with a stale "files".

The fix is to protect proc->files with a mutex to prevent cleanup
while in use.

Signed-off-by: Todd Kjos <tkjos@google.com>
Cc: stable <stable@vger.kernel.org> # 4.14
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/binder.c | 44 ++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index bccec9de0533..a7ecfde66b7b 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -482,7 +482,8 @@ enum binder_deferred_state {
  * @tsk                   task_struct for group_leader of process
  *                        (invariant after initialized)
  * @files                 files_struct for process
- *                        (invariant after initialized)
+ *                        (protected by @files_lock)
+ * @files_lock            mutex to protect @files
  * @deferred_work_node:   element for binder_deferred_list
  *                        (protected by binder_deferred_lock)
  * @deferred_work:        bitmap of deferred work to perform
@@ -530,6 +531,7 @@ struct binder_proc {
 	int pid;
 	struct task_struct *tsk;
 	struct files_struct *files;
+	struct mutex files_lock;
 	struct hlist_node deferred_work_node;
 	int deferred_work;
 	bool is_dead;
@@ -877,20 +879,26 @@ static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
 
 static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
 {
-	struct files_struct *files = proc->files;
 	unsigned long rlim_cur;
 	unsigned long irqs;
+	int ret;
 
-	if (files == NULL)
-		return -ESRCH;
-
-	if (!lock_task_sighand(proc->tsk, &irqs))
-		return -EMFILE;
-
+	mutex_lock(&proc->files_lock);
+	if (proc->files == NULL) {
+		ret = -ESRCH;
+		goto err;
+	}
+	if (!lock_task_sighand(proc->tsk, &irqs)) {
+		ret = -EMFILE;
+		goto err;
+	}
 	rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
 	unlock_task_sighand(proc->tsk, &irqs);
 
-	return __alloc_fd(files, 0, rlim_cur, flags);
+	ret = __alloc_fd(proc->files, 0, rlim_cur, flags);
+err:
+	mutex_unlock(&proc->files_lock);
+	return ret;
 }
 
 /*
@@ -899,8 +907,10 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
 static void task_fd_install(
 	struct binder_proc *proc, unsigned int fd, struct file *file)
 {
+	mutex_lock(&proc->files_lock);
 	if (proc->files)
 		__fd_install(proc->files, fd, file);
+	mutex_unlock(&proc->files_lock);
 }
 
 /*
@@ -910,9 +920,11 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
 {
 	int retval;
 
-	if (proc->files == NULL)
-		return -ESRCH;
-
+	mutex_lock(&proc->files_lock);
+	if (proc->files == NULL) {
+		retval = -ESRCH;
+		goto err;
+	}
 	retval = __close_fd(proc->files, fd);
 	/* can't restart close syscall because file table entry was cleared */
 	if (unlikely(retval == -ERESTARTSYS ||
@@ -920,7 +932,8 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
 		     retval == -ERESTARTNOHAND ||
 		     retval == -ERESTART_RESTARTBLOCK))
 		retval = -EINTR;
-
+err:
+	mutex_unlock(&proc->files_lock);
 	return retval;
 }
 
@@ -4627,7 +4640,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
 	ret = binder_alloc_mmap_handler(&proc->alloc, vma);
 	if (ret)
 		return ret;
+	mutex_lock(&proc->files_lock);
 	proc->files = get_files_struct(current);
+	mutex_unlock(&proc->files_lock);
 	return 0;
 
 err_bad_arg:
@@ -4651,6 +4666,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
 	spin_lock_init(&proc->outer_lock);
 	get_task_struct(current->group_leader);
 	proc->tsk = current->group_leader;
+	mutex_init(&proc->files_lock);
 	INIT_LIST_HEAD(&proc->todo);
 	proc->default_priority = task_nice(current);
 	binder_dev = container_of(filp->private_data, struct binder_device,
@@ -4903,9 +4919,11 @@ static void binder_deferred_func(struct work_struct *work)
 
 		files = NULL;
 		if (defer & BINDER_DEFERRED_PUT_FILES) {
+			mutex_lock(&proc->files_lock);
 			files = proc->files;
 			if (files)
 				proc->files = NULL;
+			mutex_unlock(&proc->files_lock);
 		}
 
 		if (defer & BINDER_DEFERRED_FLUSH)

From 5cfee7a357f60675cae32b494bb2096d7203efd3 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 8 Nov 2017 11:27:37 +0100
Subject: [PATCH 053/305] perf tools: Use shell function for perl cflags
 retrieval

Use the make $(shell ...) function for perl CFLAGS retrieval instead of
back quotes (``). Both run the command in a shell, but the former is more
explicit and seems to be the preferred way.

Also we don't have any other use of the back quotes in perf Makefiles.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171108102739.30338-2-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index ed65e82f034e..710623ddb8af 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -583,7 +583,7 @@ else
   PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null)
   PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
   PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
-  PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
+  PERL_EMBED_CCOPTS = $(shell perl -MExtUtils::Embed -e ccopts 2>/dev/null)
   FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 
   ifneq ($(feature-libperl), 1)

From 61fb26a6a23c0f1a07a0f8a11b54bafb1ac2398b Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 4 Dec 2017 12:23:08 -0300
Subject: [PATCH 054/305] perf tools: Fix up build in hardened environments

On Fedora systems the perl and python CFLAGS/LDFLAGS include the
hardened specs from redhat-rpm-config package. We apply them only for
perl/python objects, which makes them not compatible with the rest of
the objects and the build fails with:

  /usr/bin/ld: perf-in.o: relocation R_X86_64_32 against `.rodata.str1.1' can not be used when making a shared object; recompile with -fPIC
  /usr/bin/ld: libperf.a(libperf-in.o): relocation R_X86_64_32S against `.text' can not be used when making a shared object; recompile with -fPIC
  /usr/bin/ld: final link failed: Nonrepresentable section on output
  collect2: error: ld returned 1 exit status
  make[2]: *** [Makefile.perf:507: perf] Error 1
  make[1]: *** [Makefile.perf:210: sub-make] Error 2
  make: *** [Makefile:69: all] Error 2

Mainly it's caused by perl/python objects being compiled with:

  -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1

which makes the final link impossible, because it will check
for 'proper' objects with the following option:

  -specs=/usr/lib/rpm/redhat/redhat-hardened-ld

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20171204082437.GC30564@krava
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 710623ddb8af..0294bfb6c5f8 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -188,9 +188,7 @@ ifdef PYTHON_CONFIG
   PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
   PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil
   PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
-  ifeq ($(CC_NO_CLANG), 1)
-    PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
-  endif
+  PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
   FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
 endif
 
@@ -576,7 +574,6 @@ ifndef NO_GTK2
   endif
 endif
 
-
 ifdef NO_LIBPERL
   CFLAGS += -DNO_LIBPERL
 else
@@ -584,6 +581,8 @@ else
   PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
   PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
   PERL_EMBED_CCOPTS = $(shell perl -MExtUtils::Embed -e ccopts 2>/dev/null)
+  PERL_EMBED_CCOPTS := $(filter-out -specs=%,$(PERL_EMBED_CCOPTS))
+  PERL_EMBED_LDOPTS := $(filter-out -specs=%,$(PERL_EMBED_LDOPTS))
   FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 
   ifneq ($(feature-libperl), 1)

From ca58d7e64bdfc54f7dfe46713c1e2acc68d7522d Mon Sep 17 00:00:00 2001
From: Ben Gainey <ben.gainey@arm.com>
Date: Wed, 22 Nov 2017 18:25:41 -0600
Subject: [PATCH 055/305] perf jvmti: Generate correct debug information for
 inlined code

tools/perf/jvmti is broken in so far as it generates incorrect debug
information. Specifically it attributes all debug lines to the original
method being output even in the case that some code is being inlined
from elsewhere.  This patch fixes the issue.

To test (from within linux/tools/perf):

export JDIR=/usr/lib/jvm/java-8-openjdk-amd64/
make
cat << __EOF > Test.java
public class Test
{
    private StringBuilder b = new StringBuilder();

    private void loop(int i, String... args)
    {
        for (String a : args)
            b.append(a);

        long hc = b.hashCode() * System.nanoTime();

        b = new StringBuilder();
        b.append(hc);

        System.out.printf("Iteration %d = %d\n", i, hc);
    }

    public void run(String... args)
    {
        for (int i = 0; i < 10000; ++i)
        {
            loop(i, args);
        }
    }

    public static void main(String... args)
    {
        Test t = new Test();
        t.run(args);
    }
}
__EOF
$JDIR/bin/javac Test.java
./perf record -F 10000 -g -k mono $JDIR/bin/java -agentpath:`pwd`/libperf-jvmti.so Test
./perf inject --jit -i perf.data -o perf.data.jitted
./perf annotate -i perf.data.jitted --stdio | grep Test\.java: | sort -u

Before this patch, Test.java line numbers get reported that are greater
than the number of lines in the Test.java file.  They come from the
source file of the inlined function, e.g. java/lang/String.java:1085.
For further validation one can examine those lines in the JDK source
distribution and confirm that they map to inlined functions called by
Test.java.

After this patch, the filename of the inlined function is output
rather than the incorrect original source filename.

Signed-off-by: Ben Gainey <ben.gainey@arm.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Stephane Eranian <eranian@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Colin King <colin.king@canonical.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 598b7c6919c7 ("perf jit: add source line info support")
Link: http://lkml.kernel.org/r/20171122182541.d25599a3eb1ada3480d142fa@arm.com
Signed-off-by: Kim Phillips <kim.phillips@arm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/jvmti/jvmti_agent.c |  16 ++--
 tools/perf/jvmti/jvmti_agent.h |   7 +-
 tools/perf/jvmti/libjvmti.c    | 147 +++++++++++++++++++++++++++------
 3 files changed, 134 insertions(+), 36 deletions(-)

diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c
index cf36de7ea255..0c6d1002b524 100644
--- a/tools/perf/jvmti/jvmti_agent.c
+++ b/tools/perf/jvmti/jvmti_agent.c
@@ -384,13 +384,13 @@ jvmti_write_code(void *agent, char const *sym,
 }
 
 int
-jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
-		       jvmti_line_info_t *li, int nr_lines)
+jvmti_write_debug_info(void *agent, uint64_t code,
+    int nr_lines, jvmti_line_info_t *li,
+    const char * const * file_names)
 {
 	struct jr_code_debug_info rec;
-	size_t sret, len, size, flen;
+	size_t sret, len, size, flen = 0;
 	uint64_t addr;
-	const char *fn = file;
 	FILE *fp = agent;
 	int i;
 
@@ -405,7 +405,9 @@ jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
 		return -1;
 	}
 
-	flen = strlen(file) + 1;
+	for (i = 0; i < nr_lines; ++i) {
+	    flen += strlen(file_names[i]) + 1;
+	}
 
 	rec.p.id        = JIT_CODE_DEBUG_INFO;
 	size            = sizeof(rec);
@@ -421,7 +423,7 @@ jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
 	 * file[]   : source file name
 	 */
 	size += nr_lines * sizeof(struct debug_entry);
-	size += flen * nr_lines;
+	size += flen;
 	rec.p.total_size = size;
 
 	/*
@@ -452,7 +454,7 @@ jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
 		if (sret != 1)
 			goto error;
 
-		sret = fwrite_unlocked(fn, flen, 1, fp);
+		sret = fwrite_unlocked(file_names[i], strlen(file_names[i]) + 1, 1, fp);
 		if (sret != 1)
 			goto error;
 	}
diff --git a/tools/perf/jvmti/jvmti_agent.h b/tools/perf/jvmti/jvmti_agent.h
index fe32d8344a82..6ed82f6c06dd 100644
--- a/tools/perf/jvmti/jvmti_agent.h
+++ b/tools/perf/jvmti/jvmti_agent.h
@@ -14,6 +14,7 @@ typedef struct {
 	unsigned long	pc;
 	int		line_number;
 	int		discrim; /* discriminator -- 0 for now */
+	jmethodID	methodID;
 } jvmti_line_info_t;
 
 void *jvmti_open(void);
@@ -22,11 +23,9 @@ int   jvmti_write_code(void *agent, char const *symbol_name,
 		       uint64_t vma, void const *code,
 		       const unsigned int code_size);
 
-int   jvmti_write_debug_info(void *agent,
-		             uint64_t code,
-			     const char *file,
+int   jvmti_write_debug_info(void *agent, uint64_t code, int nr_lines,
 			     jvmti_line_info_t *li,
-			     int nr_lines);
+			     const char * const * file_names);
 
 #if defined(__cplusplus)
 }
diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c
index c62c9fc9a525..6add3e982614 100644
--- a/tools/perf/jvmti/libjvmti.c
+++ b/tools/perf/jvmti/libjvmti.c
@@ -47,6 +47,7 @@ do_get_line_numbers(jvmtiEnv *jvmti, void *pc, jmethodID m, jint bci,
 			tab[lines].pc = (unsigned long)pc;
 			tab[lines].line_number = loc_tab[i].line_number;
 			tab[lines].discrim = 0; /* not yet used */
+			tab[lines].methodID = m;
 			lines++;
 		} else {
 			break;
@@ -125,6 +126,99 @@ get_line_numbers(jvmtiEnv *jvmti, const void *compile_info, jvmti_line_info_t **
 	return JVMTI_ERROR_NONE;
 }
 
+static void
+copy_class_filename(const char * class_sign, const char * file_name, char * result, size_t max_length)
+{
+	/*
+	* Assume path name is class hierarchy, this is a common practice with Java programs
+	*/
+	if (*class_sign == 'L') {
+		int j, i = 0;
+		char *p = strrchr(class_sign, '/');
+		if (p) {
+			/* drop the 'L' prefix and copy up to the final '/' */
+			for (i = 0; i < (p - class_sign); i++)
+				result[i] = class_sign[i+1];
+		}
+		/*
+		* append file name, we use loops and not string ops to avoid modifying
+		* class_sign which is used later for the symbol name
+		*/
+		for (j = 0; i < (max_length - 1) && file_name && j < strlen(file_name); j++, i++)
+			result[i] = file_name[j];
+
+		result[i] = '\0';
+	} else {
+		/* fallback case */
+		size_t file_name_len = strlen(file_name);
+		strncpy(result, file_name, file_name_len < max_length ? file_name_len : max_length);
+	}
+}
+
+static jvmtiError
+get_source_filename(jvmtiEnv *jvmti, jmethodID methodID, char ** buffer)
+{
+	jvmtiError ret;
+	jclass decl_class;
+	char *file_name = NULL;
+	char *class_sign = NULL;
+	char fn[PATH_MAX];
+	size_t len;
+
+	ret = (*jvmti)->GetMethodDeclaringClass(jvmti, methodID, &decl_class);
+	if (ret != JVMTI_ERROR_NONE) {
+		print_error(jvmti, "GetMethodDeclaringClass", ret);
+		return ret;
+	}
+
+	ret = (*jvmti)->GetSourceFileName(jvmti, decl_class, &file_name);
+	if (ret != JVMTI_ERROR_NONE) {
+		print_error(jvmti, "GetSourceFileName", ret);
+		return ret;
+	}
+
+	ret = (*jvmti)->GetClassSignature(jvmti, decl_class, &class_sign, NULL);
+	if (ret != JVMTI_ERROR_NONE) {
+		print_error(jvmti, "GetClassSignature", ret);
+		goto free_file_name_error;
+	}
+
+	copy_class_filename(class_sign, file_name, fn, PATH_MAX);
+	len = strlen(fn);
+	*buffer = malloc((len + 1) * sizeof(char));
+	if (!*buffer) {
+		print_error(jvmti, "GetClassSignature", ret);
+		ret = JVMTI_ERROR_OUT_OF_MEMORY;
+		goto free_class_sign_error;
+	}
+	strcpy(*buffer, fn);
+	ret = JVMTI_ERROR_NONE;
+
+free_class_sign_error:
+	(*jvmti)->Deallocate(jvmti, (unsigned char *)class_sign);
+free_file_name_error:
+	(*jvmti)->Deallocate(jvmti, (unsigned char *)file_name);
+
+	return ret;
+}
+
+static jvmtiError
+fill_source_filenames(jvmtiEnv *jvmti, int nr_lines,
+		      const jvmti_line_info_t * line_tab,
+		      char ** file_names)
+{
+	int index;
+	jvmtiError ret;
+
+	for (index = 0; index < nr_lines; ++index) {
+		ret = get_source_filename(jvmti, line_tab[index].methodID, &(file_names[index]));
+		if (ret != JVMTI_ERROR_NONE)
+			return ret;
+	}
+
+	return JVMTI_ERROR_NONE;
+}
+
 static void JNICALL
 compiled_method_load_cb(jvmtiEnv *jvmti,
 			jmethodID method,
@@ -135,16 +229,18 @@ compiled_method_load_cb(jvmtiEnv *jvmti,
 			const void *compile_info)
 {
 	jvmti_line_info_t *line_tab = NULL;
+	char ** line_file_names = NULL;
 	jclass decl_class;
 	char *class_sign = NULL;
 	char *func_name = NULL;
 	char *func_sign = NULL;
-	char *file_name= NULL;
+	char *file_name = NULL;
 	char fn[PATH_MAX];
 	uint64_t addr = (uint64_t)(uintptr_t)code_addr;
 	jvmtiError ret;
 	int nr_lines = 0; /* in line_tab[] */
 	size_t len;
+	int output_debug_info = 0;
 
 	ret = (*jvmti)->GetMethodDeclaringClass(jvmti, method,
 						&decl_class);
@@ -158,6 +254,19 @@ compiled_method_load_cb(jvmtiEnv *jvmti,
 		if (ret != JVMTI_ERROR_NONE) {
 			warnx("jvmti: cannot get line table for method");
 			nr_lines = 0;
+		} else if (nr_lines > 0) {
+			line_file_names = malloc(sizeof(char*) * nr_lines);
+			if (!line_file_names) {
+				warnx("jvmti: cannot allocate space for line table method names");
+			} else {
+				memset(line_file_names, 0, sizeof(char*) * nr_lines);
+				ret = fill_source_filenames(jvmti, nr_lines, line_tab, line_file_names);
+				if (ret != JVMTI_ERROR_NONE) {
+					warnx("jvmti: fill_source_filenames failed");
+				} else {
+					output_debug_info = 1;
+				}
+			}
 		}
 	}
 
@@ -181,33 +290,14 @@ compiled_method_load_cb(jvmtiEnv *jvmti,
 		goto error;
 	}
 
-	/*
-	 * Assume path name is class hierarchy, this is a common practice with Java programs
-	 */
-	if (*class_sign == 'L') {
-		int j, i = 0;
-		char *p = strrchr(class_sign, '/');
-		if (p) {
-			/* drop the 'L' prefix and copy up to the final '/' */
-			for (i = 0; i < (p - class_sign); i++)
-				fn[i] = class_sign[i+1];
-		}
-		/*
-		 * append file name, we use loops and not string ops to avoid modifying
-		 * class_sign which is used later for the symbol name
-		 */
-		for (j = 0; i < (PATH_MAX - 1) && file_name && j < strlen(file_name); j++, i++)
-			fn[i] = file_name[j];
-		fn[i] = '\0';
-	} else {
-		/* fallback case */
-		strcpy(fn, file_name);
-	}
+	copy_class_filename(class_sign, file_name, fn, PATH_MAX);
+
 	/*
 	 * write source line info record if we have it
 	 */
-	if (jvmti_write_debug_info(jvmti_agent, addr, fn, line_tab, nr_lines))
-		warnx("jvmti: write_debug_info() failed");
+	if (output_debug_info)
+		if (jvmti_write_debug_info(jvmti_agent, addr, nr_lines, line_tab, (const char * const *) line_file_names))
+			warnx("jvmti: write_debug_info() failed");
 
 	len = strlen(func_name) + strlen(class_sign) + strlen(func_sign) + 2;
 	{
@@ -223,6 +313,13 @@ error:
 	(*jvmti)->Deallocate(jvmti, (unsigned char *)class_sign);
 	(*jvmti)->Deallocate(jvmti, (unsigned char *)file_name);
 	free(line_tab);
+	while (line_file_names && (nr_lines > 0)) {
+	    if (line_file_names[nr_lines - 1]) {
+	        free(line_file_names[nr_lines - 1]);
+	    }
+	    nr_lines -= 1;
+	}
+	free(line_file_names);
 }
 
 static void JNICALL

From 10b9baa701d5023897f70a4acb3bf0235da3dc4f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 28 Nov 2017 11:08:41 -0300
Subject: [PATCH 056/305] tools arch s390: Do not include header files from the
 kernel sources

Long ago we decided that it is verboten for source code living in tools/
to include files from the kernel git sources, to avoid disturbing kernel
development (and perf's and other tools/) when, say, a kernel hacker
adds something, tests everything but tools/ and ends up with the tools/
build broken.

This got broken recently by s/390, fix it by copying
arch/s390/include/uapi/asm/perf_regs.h to tools/arch/s390/include/uapi/asm/,
making this one be used by means of <asm/perf_regs.h> and updating
tools/perf/check_headers.sh to make sure we are notified when the
original changes, so that we can check if anything is needed on the
tooling side.

This would have been caught by the 'tarpkg' test entry in:

$ make -C tools/perf build-test

When run on a s/390 build system or container.

Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Fixes: f704ef44602f ("s390/perf: add support for perf_regs and libdw")
Link: https://lkml.kernel.org/n/tip-n57139ic0v9uffx8wdqi3d8a@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/s390/include/uapi/asm/perf_regs.h | 44 ++++++++++++++++++++
 tools/perf/arch/s390/include/perf_regs.h     |  2 +-
 tools/perf/check-headers.sh                  |  1 +
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 tools/arch/s390/include/uapi/asm/perf_regs.h

diff --git a/tools/arch/s390/include/uapi/asm/perf_regs.h b/tools/arch/s390/include/uapi/asm/perf_regs.h
new file mode 100644
index 000000000000..d17dd9e5d516
--- /dev/null
+++ b/tools/arch/s390/include/uapi/asm/perf_regs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_S390_PERF_REGS_H
+#define _ASM_S390_PERF_REGS_H
+
+enum perf_event_s390_regs {
+	PERF_REG_S390_R0,
+	PERF_REG_S390_R1,
+	PERF_REG_S390_R2,
+	PERF_REG_S390_R3,
+	PERF_REG_S390_R4,
+	PERF_REG_S390_R5,
+	PERF_REG_S390_R6,
+	PERF_REG_S390_R7,
+	PERF_REG_S390_R8,
+	PERF_REG_S390_R9,
+	PERF_REG_S390_R10,
+	PERF_REG_S390_R11,
+	PERF_REG_S390_R12,
+	PERF_REG_S390_R13,
+	PERF_REG_S390_R14,
+	PERF_REG_S390_R15,
+	PERF_REG_S390_FP0,
+	PERF_REG_S390_FP1,
+	PERF_REG_S390_FP2,
+	PERF_REG_S390_FP3,
+	PERF_REG_S390_FP4,
+	PERF_REG_S390_FP5,
+	PERF_REG_S390_FP6,
+	PERF_REG_S390_FP7,
+	PERF_REG_S390_FP8,
+	PERF_REG_S390_FP9,
+	PERF_REG_S390_FP10,
+	PERF_REG_S390_FP11,
+	PERF_REG_S390_FP12,
+	PERF_REG_S390_FP13,
+	PERF_REG_S390_FP14,
+	PERF_REG_S390_FP15,
+	PERF_REG_S390_MASK,
+	PERF_REG_S390_PC,
+
+	PERF_REG_S390_MAX
+};
+
+#endif /* _ASM_S390_PERF_REGS_H */
diff --git a/tools/perf/arch/s390/include/perf_regs.h b/tools/perf/arch/s390/include/perf_regs.h
index d2df54a6bc5a..bcfbaed78cc2 100644
--- a/tools/perf/arch/s390/include/perf_regs.h
+++ b/tools/perf/arch/s390/include/perf_regs.h
@@ -3,7 +3,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <../../../../arch/s390/include/uapi/asm/perf_regs.h>
+#include <asm/perf_regs.h>
 
 void perf_regs_load(u64 *regs);
 
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index 6db9d809fe97..3e64f10b6d66 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -21,6 +21,7 @@ arch/x86/include/asm/cpufeatures.h
 arch/arm/include/uapi/asm/perf_regs.h
 arch/arm64/include/uapi/asm/perf_regs.h
 arch/powerpc/include/uapi/asm/perf_regs.h
+arch/s390/include/uapi/asm/perf_regs.h
 arch/x86/include/uapi/asm/perf_regs.h
 arch/x86/include/uapi/asm/kvm.h
 arch/x86/include/uapi/asm/kvm_perf.h

From ca26cffa4e4aaeb09bb9e308f95c7835cb149248 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 4 Dec 2017 13:08:47 -0300
Subject: [PATCH 057/305] x86/asm: Allow again using asm.h when building for
 the 'bpf' clang target

Up to f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang")
we were able to use x86 headers to build to the 'bpf' clang target, as
done by the BPF code in tools/perf/.

With that commit, we ended up with the following failure for 'perf test
LLVM'. This is because "clang ... -target bpf ..." fails, since 4.0 does not
have bpf inline asm support and 6.0 does not recognize the register 'esp'.
Fix it by guarding that part with an #ifndef __BPF__, which is defined by
clang when building for the "bpf" target.

  # perf test -v LLVM
  37: LLVM search and compile                               :
  37.1: Basic BPF llvm compile                              :
  --- start ---
  test child forked, pid 25526
  Kernel build dir is set to /lib/modules/4.14.0+/build
  set env: KBUILD_DIR=/lib/modules/4.14.0+/build
  unset env: KBUILD_OPTS
  include option is set to  -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated  -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h
  set env: NR_CPUS=4
  set env: LINUX_VERSION_CODE=0x40e00
  set env: CLANG_EXEC=/usr/local/bin/clang
  set env: CLANG_OPTIONS=-xc
  set env: KERNEL_INC_OPTIONS= -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated  -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h
  set env: WORKING_DIR=/lib/modules/4.14.0+/build
  set env: CLANG_SOURCE=-
  llvm compiling command template: echo '/*
   * bpf-script-example.c
   * Test basic LLVM building
   */
  #ifndef LINUX_VERSION_CODE
  # error Need LINUX_VERSION_CODE
  # error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig'
  #endif
  #define BPF_ANY 0
  #define BPF_MAP_TYPE_ARRAY 2
  #define BPF_FUNC_map_lookup_elem 1
  #define BPF_FUNC_map_update_elem 2

  static void *(*bpf_map_lookup_elem)(void *map, void *key) =
	  (void *) BPF_FUNC_map_lookup_elem;
  static void *(*bpf_map_update_elem)(void *map, void *key, void *value, int flags) =
	  (void *) BPF_FUNC_map_update_elem;

  struct bpf_map_def {
	  unsigned int type;
	  unsigned int key_size;
	  unsigned int value_size;
	  unsigned int max_entries;
  };

  #define SEC(NAME) __attribute__((section(NAME), used))
  struct bpf_map_def SEC("maps") flip_table = {
	  .type = BPF_MAP_TYPE_ARRAY,
	  .key_size = sizeof(int),
	  .value_size = sizeof(int),
	  .max_entries = 1,
  };

  SEC("func=SyS_epoll_wait")
  int bpf_func__SyS_epoll_wait(void *ctx)
  {
	  int ind =0;
	  int *flag = bpf_map_lookup_elem(&flip_table, &ind);
	  int new_flag;
	  if (!flag)
		  return 0;
	  /* flip flag and store back */
	  new_flag = !*flag;
	  bpf_map_update_elem(&flip_table, &ind, &new_flag, BPF_ANY);
	  return new_flag;
  }
  char _license[] SEC("license") = "GPL";
  int _version SEC("version") = LINUX_VERSION_CODE;
  ' | $CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS -DLINUX_VERSION_CODE=$LINUX_VERSION_CODE $CLANG_OPTIONS $KERNEL_INC_OPTIONS -Wno-unused-value -Wno-pointer-sign -working-directory $WORKING_DIR -c "$CLANG_SOURCE" -target bpf -O2 -o -
  test child finished with 0
  ---- end ----
  LLVM search and compile subtest 0: Ok
  37.2: kbuild searching                                    :
  --- start ---
  test child forked, pid 25950
  Kernel build dir is set to /lib/modules/4.14.0+/build
  set env: KBUILD_DIR=/lib/modules/4.14.0+/build
  unset env: KBUILD_OPTS
  include option is set to  -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated  -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h
  set env: NR_CPUS=4
  set env: LINUX_VERSION_CODE=0x40e00
  set env: CLANG_EXEC=/usr/local/bin/clang
  set env: CLANG_OPTIONS=-xc
  set env: KERNEL_INC_OPTIONS= -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated  -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h
  set env: WORKING_DIR=/lib/modules/4.14.0+/build
  set env: CLANG_SOURCE=-
  llvm compiling command template: echo '/*
   * bpf-script-test-kbuild.c
   * Test include from kernel header
   */
  #ifndef LINUX_VERSION_CODE
  # error Need LINUX_VERSION_CODE
  # error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig'
  #endif
  #define SEC(NAME) __attribute__((section(NAME), used))

  #include <uapi/linux/fs.h>
  #include <uapi/asm/ptrace.h>

  SEC("func=vfs_llseek")
  int bpf_func__vfs_llseek(void *ctx)
  {
	  return 0;
  }

  char _license[] SEC("license") = "GPL";
  int _version SEC("version") = LINUX_VERSION_CODE;
  ' | $CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS -DLINUX_VERSION_CODE=$LINUX_VERSION_CODE $CLANG_OPTIONS $KERNEL_INC_OPTIONS -Wno-unused-value -Wno-pointer-sign -working-directory $WORKING_DIR -c "$CLANG_SOURCE" -target bpf -O2 -o -
  In file included from <stdin>:12:
  In file included from /home/acme/git/linux/arch/x86/include/uapi/asm/ptrace.h:5:
  In file included from /home/acme/git/linux/include/linux/compiler.h:242:
  In file included from /home/acme/git/linux/arch/x86/include/asm/barrier.h:5:
  In file included from /home/acme/git/linux/arch/x86/include/asm/alternative.h:10:
  /home/acme/git/linux/arch/x86/include/asm/asm.h:145:50: error: unknown register name 'esp' in asm
  register unsigned long current_stack_pointer asm(_ASM_SP);
                                                   ^
  /home/acme/git/linux/arch/x86/include/asm/asm.h:44:18: note: expanded from macro '_ASM_SP'
  #define _ASM_SP         __ASM_REG(sp)
                          ^
  /home/acme/git/linux/arch/x86/include/asm/asm.h:27:32: note: expanded from macro '__ASM_REG'
  #define __ASM_REG(reg)         __ASM_SEL_RAW(e##reg, r##reg)
                                 ^
  /home/acme/git/linux/arch/x86/include/asm/asm.h:18:29: note: expanded from macro '__ASM_SEL_RAW'
  # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
                              ^
  /home/acme/git/linux/arch/x86/include/asm/asm.h:11:32: note: expanded from macro '__ASM_FORM_RAW'
  # define __ASM_FORM_RAW(x)     #x
                                 ^
  <scratch space>:4:1: note: expanded from here
  "esp"
  ^
  1 error generated.
  ERROR:	unable to compile -
  Hint:	Check error message shown above.
  Hint:	You can also pre-compile it into .o using:
     		  clang -target bpf -O2 -c -
     	  with proper -I and -D options.
  Failed to compile test case: 'kbuild searching'
  test child finished with -1
  ---- end ----
  LLVM search and compile subtest 1: FAILED!

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Yonghong Song <yhs@fb.com>
Link: https://lkml.kernel.org/r/20171128175948.GL3298@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/x86/include/asm/asm.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 219faaec51df..386a6900e206 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -136,6 +136,7 @@
 #endif
 
 #ifndef __ASSEMBLY__
+#ifndef __BPF__
 /*
  * This output constraint should be used for any inline asm which has a "call"
  * instruction.  Otherwise the asm may be inserted before the frame pointer
@@ -145,5 +146,6 @@
 register unsigned long current_stack_pointer asm(_ASM_SP);
 #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
 #endif
+#endif
 
 #endif /* _ASM_X86_ASM_H */

From bb422a738f6566f7439cd347d54e321e4fe92a9f Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 18 Dec 2017 20:31:41 +0900
Subject: [PATCH 058/305] mm,vmscan: Make unregister_shrinker() no-op if
 register_shrinker() failed.

Syzbot caught an oops at unregister_shrinker() because the combination of
commit 1d3d4437eae1bb29 ("vmscan: per-node deferred work") and fault
injection made register_shrinker() fail, and the caller of
register_shrinker() did not check for failure.

----------
[  554.881422] FAULT_INJECTION: forcing a failure.
[  554.881422] name failslab, interval 1, probability 0, space 0, times 0
[  554.881438] CPU: 1 PID: 13231 Comm: syz-executor1 Not tainted 4.14.0-rc8+ #82
[  554.881443] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
[  554.881445] Call Trace:
[  554.881459]  dump_stack+0x194/0x257
[  554.881474]  ? arch_local_irq_restore+0x53/0x53
[  554.881486]  ? find_held_lock+0x35/0x1d0
[  554.881507]  should_fail+0x8c0/0xa40
[  554.881522]  ? fault_create_debugfs_attr+0x1f0/0x1f0
[  554.881537]  ? check_noncircular+0x20/0x20
[  554.881546]  ? find_next_zero_bit+0x2c/0x40
[  554.881560]  ? ida_get_new_above+0x421/0x9d0
[  554.881577]  ? find_held_lock+0x35/0x1d0
[  554.881594]  ? __lock_is_held+0xb6/0x140
[  554.881628]  ? check_same_owner+0x320/0x320
[  554.881634]  ? lock_downgrade+0x990/0x990
[  554.881649]  ? find_held_lock+0x35/0x1d0
[  554.881672]  should_failslab+0xec/0x120
[  554.881684]  __kmalloc+0x63/0x760
[  554.881692]  ? lock_downgrade+0x990/0x990
[  554.881712]  ? register_shrinker+0x10e/0x2d0
[  554.881721]  ? trace_event_raw_event_module_request+0x320/0x320
[  554.881737]  register_shrinker+0x10e/0x2d0
[  554.881747]  ? prepare_kswapd_sleep+0x1f0/0x1f0
[  554.881755]  ? _down_write_nest_lock+0x120/0x120
[  554.881765]  ? memcpy+0x45/0x50
[  554.881785]  sget_userns+0xbcd/0xe20
(...snipped...)
[  554.898693] kasan: CONFIG_KASAN_INLINE enabled
[  554.898724] kasan: GPF could be caused by NULL-ptr deref or user memory access
[  554.898732] general protection fault: 0000 [#1] SMP KASAN
[  554.898737] Dumping ftrace buffer:
[  554.898741]    (ftrace buffer empty)
[  554.898743] Modules linked in:
[  554.898752] CPU: 1 PID: 13231 Comm: syz-executor1 Not tainted 4.14.0-rc8+ #82
[  554.898755] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
[  554.898760] task: ffff8801d1dbe5c0 task.stack: ffff8801c9e38000
[  554.898772] RIP: 0010:__list_del_entry_valid+0x7e/0x150
[  554.898775] RSP: 0018:ffff8801c9e3f108 EFLAGS: 00010246
[  554.898780] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000
[  554.898784] RDX: 0000000000000000 RSI: ffff8801c53c6f98 RDI: ffff8801c53c6fa0
[  554.898788] RBP: ffff8801c9e3f120 R08: 1ffff100393c7d55 R09: 0000000000000004
[  554.898791] R10: ffff8801c9e3ef70 R11: 0000000000000000 R12: 0000000000000000
[  554.898795] R13: dffffc0000000000 R14: 1ffff100393c7e45 R15: ffff8801c53c6f98
[  554.898800] FS:  0000000000000000(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
[  554.898804] CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
[  554.898807] CR2: 00000000dbc23000 CR3: 00000001c7269000 CR4: 00000000001406e0
[  554.898813] DR0: 0000000020000000 DR1: 0000000020000000 DR2: 0000000000000000
[  554.898816] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600
[  554.898818] Call Trace:
[  554.898828]  unregister_shrinker+0x79/0x300
[  554.898837]  ? perf_trace_mm_vmscan_writepage+0x750/0x750
[  554.898844]  ? down_write+0x87/0x120
[  554.898851]  ? deactivate_super+0x139/0x1b0
[  554.898857]  ? down_read+0x150/0x150
[  554.898864]  ? check_same_owner+0x320/0x320
[  554.898875]  deactivate_locked_super+0x64/0xd0
[  554.898883]  deactivate_super+0x141/0x1b0
----------

Since allowing register_shrinker() callers to call unregister_shrinker()
when register_shrinker() failed can simplify the error recovery path, this
patch makes unregister_shrinker() a no-op when register_shrinker() failed.
Also, reset shrinker->nr_deferred to NULL in case unregister_shrinker() is
called twice by mistake.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Aliaksei Karaliou <akaraliou.dev@gmail.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Glauber Costa <glauber@scylladb.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/vmscan.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c02c850ea349..47d5ced51f2d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -297,10 +297,13 @@ EXPORT_SYMBOL(register_shrinker);
  */
 void unregister_shrinker(struct shrinker *shrinker)
 {
+	if (!shrinker->nr_deferred)
+		return;
 	down_write(&shrinker_rwsem);
 	list_del(&shrinker->list);
 	up_write(&shrinker_rwsem);
 	kfree(shrinker->nr_deferred);
+	shrinker->nr_deferred = NULL;
 }
 EXPORT_SYMBOL(unregister_shrinker);
 

From 9ee332d99e4d5a97548943b81c54668450ce641b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 18 Dec 2017 15:05:07 -0500
Subject: [PATCH 059/305] sget(): handle failures of register_shrinker()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/super.c b/fs/super.c
index 7ff1349609e4..06bd25d90ba5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -517,7 +517,11 @@ retry:
 	hlist_add_head(&s->s_instances, &type->fs_supers);
 	spin_unlock(&sb_lock);
 	get_filesystem(type);
-	register_shrinker(&s->s_shrink);
+	err = register_shrinker(&s->s_shrink);
+	if (err) {
+		deactivate_locked_super(s);
+		s = ERR_PTR(err);
+	}
 	return s;
 }
 

From 8b7e9d9e2d8b4de6f0d5d7a5fc63f48b1fbcf4d4 Mon Sep 17 00:00:00 2001
From: Anthony Kim <anthony.kim@hideep.com>
Date: Mon, 18 Dec 2017 11:50:48 -0800
Subject: [PATCH 060/305] Input: hideep - fix compile error due to missing
 include file

The gpiod_*() API requires including <linux/gpio/consumer.h>. Also, we use
neither the legacy GPIO API nor static board file descriptions, so there is
no need to include <linux/gpio.h> or <linux/gpio/machine.h>.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Anthony Kim <anthony.kim@hideep.com>
Patchwork-Id: 10094831
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/hideep.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/input/touchscreen/hideep.c b/drivers/input/touchscreen/hideep.c
index fc080a7c2e1f..f1cd4dd9a4a3 100644
--- a/drivers/input/touchscreen/hideep.c
+++ b/drivers/input/touchscreen/hideep.c
@@ -10,8 +10,7 @@
 #include <linux/of.h>
 #include <linux/firmware.h>
 #include <linux/delay.h>
-#include <linux/gpio.h>
-#include <linux/gpio/machine.h>
+#include <linux/gpio/consumer.h>
 #include <linux/i2c.h>
 #include <linux/acpi.h>
 #include <linux/interrupt.h>

From 81b6c999897919d5a16fedc018fe375dbab091c5 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Wed, 13 Dec 2017 14:21:37 +0100
Subject: [PATCH 061/305] scsi: core: check for device state in
 __scsi_remove_target()

As it turns out, get_device() doesn't use kref_get_unless_zero(), so we
will always get a device pointer.  Consequently, we need to check the
device state in __scsi_remove_target() to avoid tripping over deleted
objects.

Fixes: fbce4d97fd43 ("scsi: fixup kernel warning during rmmod()")
Reported-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Ewan D. Milne <emilne@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_sysfs.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index a9996c16f4ae..26ce17178401 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -1415,7 +1415,10 @@ static void __scsi_remove_target(struct scsi_target *starget)
 		 * check.
 		 */
 		if (sdev->channel != starget->channel ||
-		    sdev->id != starget->id ||
+		    sdev->id != starget->id)
+			continue;
+		if (sdev->sdev_state == SDEV_DEL ||
+		    sdev->sdev_state == SDEV_CANCEL ||
 		    !get_device(&sdev->sdev_gendev))
 			continue;
 		spin_unlock_irqrestore(shost->host_lock, flags);

From 6454b3bdd138dfc640deb5e7b9a0668fca2d55dd Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Dec 2017 15:13:44 -0600
Subject: [PATCH 062/305] x86/stacktrace: Make zombie stack traces reliable

Commit:

  1959a60182f4 ("x86/dumpstack: Pin the target stack when dumping it")

changed the behavior of stack traces for zombies.  Before that commit,
/proc/<pid>/stack reported the last execution path of the zombie before
it died:

  [<ffffffff8105b877>] do_exit+0x6f7/0xa80
  [<ffffffff8105bc79>] do_group_exit+0x39/0xa0
  [<ffffffff8105bcf0>] __wake_up_parent+0x0/0x30
  [<ffffffff8152dd09>] system_call_fastpath+0x16/0x1b
  [<00007fd128f9c4f9>] 0x7fd128f9c4f9
  [<ffffffffffffffff>] 0xffffffffffffffff

After the commit, it just reports an empty stack trace.

The new behavior is actually probably more correct.  If the stack
refcount has gone down to zero, then the task has already gone through
do_exit() and isn't going to run anymore.  The stack could be freed at
any time and is basically gone, so reporting an empty stack makes sense.

However, save_stack_trace_tsk_reliable() treats such a missing stack
condition as an error.  That can cause livepatch transition stalls if
there are any unreaped zombies.  Instead, just treat it as a reliable,
empty stack.

Reported-and-tested-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Fixes: af085d9084b4 ("stacktrace/x86: add function for detecting reliable stack traces")
Link: http://lkml.kernel.org/r/e4b09e630e99d0c1080528f0821fc9d9dbaeea82.1513631620.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/stacktrace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 77835bc021c7..20161ef53537 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -164,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk,
 {
 	int ret;
 
+	/*
+	 * If the task doesn't have a stack (e.g., a zombie), the stack is
+	 * "reliably" empty.
+	 */
 	if (!try_get_task_stack(tsk))
-		return -EINVAL;
+		return 0;
 
 	ret = __save_stack_trace_reliable(trace, tsk);
 

From eac6a3639decefcc8eb0941dd3cebe79993670ad Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Thu, 7 Dec 2017 16:58:59 +0100
Subject: [PATCH 063/305] ARM: dts: sun8i: a711: Reinstate the PMIC compatible

When we added the regulator support in commit 90c5d7cdae64 ("ARM: dts:
sun8i: a711: Add regulator support"), we also dropped the PMIC's
compatible. Since it's not in the PMIC DTSI, unlike most other PMIC
DTSI, it obviously wasn't probing anymore.

Re-add it so that everything works again.

Fixes: 90c5d7cdae64 ("ARM: dts: sun8i: a711: Add regulator support")
Reviewed-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts
index 98715538932f..a021ee6da396 100644
--- a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts
+++ b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts
@@ -146,6 +146,7 @@
 	status = "okay";
 
 	axp81x: pmic@3a3 {
+		compatible = "x-powers,axp813";
 		reg = <0x3a3>;
 		interrupt-parent = <&r_intc>;
 		interrupts = <0 IRQ_TYPE_LEVEL_LOW>;

From 3920bb713038810f25770e7545b79f204685c8f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZ=20Lin=20=28=E6=9E=97=E4=B8=8A=E6=99=BA=29?=
 <sz.lin@moxa.com>
Date: Tue, 19 Dec 2017 17:40:32 +0800
Subject: [PATCH 064/305] USB: serial: option: adding support for YUGA
 CLM920-NC5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds support for YUGA CLM920-NC5 PID 0x9625 USB modem to option
driver.

Interface layout:
0: QCDM/DIAG
1: ADB
2: MODEM
3: AT
4: RMNET

Signed-off-by: Taiyi Wu <taiyity.wu@moxa.com>
Signed-off-by: SZ Lin (林上智) <sz.lin@moxa.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/option.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index b02fb576b856..b6320e3be429 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -233,6 +233,8 @@ static void option_instat_callback(struct urb *urb);
 /* These Quectel products use Qualcomm's vendor ID */
 #define QUECTEL_PRODUCT_UC20			0x9003
 #define QUECTEL_PRODUCT_UC15			0x9090
+/* These Yuga products use Qualcomm's vendor ID */
+#define YUGA_PRODUCT_CLM920_NC5			0x9625
 
 #define QUECTEL_VENDOR_ID			0x2c7c
 /* These Quectel products use Quectel's vendor ID */
@@ -680,6 +682,10 @@ static const struct option_blacklist_info cinterion_rmnet2_blacklist = {
 	.reserved = BIT(4) | BIT(5),
 };
 
+static const struct option_blacklist_info yuga_clm920_nc5_blacklist = {
+	.reserved = BIT(1) | BIT(4),
+};
+
 static const struct usb_device_id option_ids[] = {
 	{ USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) },
 	{ USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) },
@@ -1184,6 +1190,9 @@ static const struct usb_device_id option_ids[] = {
 	{ USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC15)},
 	{ USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC20),
 	  .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
+	/* Yuga products use Qualcomm vendor ID */
+	{ USB_DEVICE(QUALCOMM_VENDOR_ID, YUGA_PRODUCT_CLM920_NC5),
+	  .driver_info = (kernel_ulong_t)&yuga_clm920_nc5_blacklist },
 	/* Quectel products using Quectel vendor ID */
 	{ USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC21),
 	  .driver_info = (kernel_ulong_t)&net_intf4_blacklist },

From 07b9f12864d16c3a861aef4817eb1efccbc5d0e6 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Tue, 19 Dec 2017 11:14:42 +0200
Subject: [PATCH 065/305] USB: Fix off by one in type-specific length check of
 BOS SSP capability

USB 3.1 devices are not detected as 3.1 capable since 4.15-rc3 due to an
off-by-one error in commit 81cf4a45360f ("USB: core: Add type-specific
length check of BOS descriptors").

It uses USB_DT_USB_SSP_CAP_SIZE() to get SSP capability size which takes
the zero based SSAC as argument, not the actual count of sublink speed
attributes.

USB3 spec 9.6.2.5 says "The number of Sublink Speed Attributes = SSAC + 1."

The type-specific length check patch was also added to stable, so it needs
to be fixed there as well.

Fixes: 81cf4a45360f ("USB: core: Add type-specific length check of BOS descriptors")
Cc: linux-stable <stable@vger.kernel.org>
CC: Masakazu Mokuno <masakazu.mokuno@gmail.com>
Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/config.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index 78e92d29f8d9..c821b4b9647e 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c
@@ -1007,7 +1007,7 @@ int usb_get_bos_descriptor(struct usb_device *dev)
 		case USB_SSP_CAP_TYPE:
 			ssp_cap = (struct usb_ssp_cap_descriptor *)buffer;
 			ssac = (le32_to_cpu(ssp_cap->bmAttributes) &
-				USB_SSP_SUBLINK_SPEED_ATTRIBS) + 1;
+				USB_SSP_SUBLINK_SPEED_ATTRIBS);
 			if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac))
 				dev->bos->ssp_cap = ssp_cap;
 			break;

From 8272d099d05f7ab2776cf56a2ab9f9443be18907 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Mon, 18 Dec 2017 17:24:22 -0700
Subject: [PATCH 066/305] usbip: vhci: stop printing kernel pointer addresses
 in messages

Remove and/or change debug, info, and error messages so that they do not
print kernel pointer addresses.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/vhci_hcd.c | 10 ----------
 drivers/usb/usbip/vhci_rx.c  | 23 +++++++++++------------
 drivers/usb/usbip/vhci_tx.c  |  3 ++-
 3 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c
index 6b3278c4b72a..9efab3dc3734 100644
--- a/drivers/usb/usbip/vhci_hcd.c
+++ b/drivers/usb/usbip/vhci_hcd.c
@@ -656,9 +656,6 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
 	struct vhci_device *vdev;
 	unsigned long flags;
 
-	usbip_dbg_vhci_hc("enter, usb_hcd %p urb %p mem_flags %d\n",
-			  hcd, urb, mem_flags);
-
 	if (portnum > VHCI_HC_PORTS) {
 		pr_err("invalid port number %d\n", portnum);
 		return -ENODEV;
@@ -822,8 +819,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 	struct vhci_device *vdev;
 	unsigned long flags;
 
-	pr_info("dequeue a urb %p\n", urb);
-
 	spin_lock_irqsave(&vhci->lock, flags);
 
 	priv = urb->hcpriv;
@@ -851,7 +846,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 		/* tcp connection is closed */
 		spin_lock(&vdev->priv_lock);
 
-		pr_info("device %p seems to be disconnected\n", vdev);
 		list_del(&priv->list);
 		kfree(priv);
 		urb->hcpriv = NULL;
@@ -863,8 +857,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 		 * vhci_rx will receive RET_UNLINK and give back the URB.
 		 * Otherwise, we give back it here.
 		 */
-		pr_info("gives back urb %p\n", urb);
-
 		usb_hcd_unlink_urb_from_ep(hcd, urb);
 
 		spin_unlock_irqrestore(&vhci->lock, flags);
@@ -892,8 +884,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 
 		unlink->unlink_seqnum = priv->seqnum;
 
-		pr_info("device %p seems to be still connected\n", vdev);
-
 		/* send cmd_unlink and try to cancel the pending URB in the
 		 * peer */
 		list_add_tail(&unlink->list, &vdev->unlink_tx);
diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c
index 90577e8b2282..112ebb90d8c9 100644
--- a/drivers/usb/usbip/vhci_rx.c
+++ b/drivers/usb/usbip/vhci_rx.c
@@ -23,24 +23,23 @@ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum)
 		urb = priv->urb;
 		status = urb->status;
 
-		usbip_dbg_vhci_rx("find urb %p vurb %p seqnum %u\n",
-				urb, priv, seqnum);
+		usbip_dbg_vhci_rx("find urb seqnum %u\n", seqnum);
 
 		switch (status) {
 		case -ENOENT:
 			/* fall through */
 		case -ECONNRESET:
-			dev_info(&urb->dev->dev,
-				 "urb %p was unlinked %ssynchronuously.\n", urb,
-				 status == -ENOENT ? "" : "a");
+			dev_dbg(&urb->dev->dev,
+				 "urb seq# %u was unlinked %ssynchronuously\n",
+				 seqnum, status == -ENOENT ? "" : "a");
 			break;
 		case -EINPROGRESS:
 			/* no info output */
 			break;
 		default:
-			dev_info(&urb->dev->dev,
-				 "urb %p may be in a error, status %d\n", urb,
-				 status);
+			dev_dbg(&urb->dev->dev,
+				 "urb seq# %u may be in a error, status %d\n",
+				 seqnum, status);
 		}
 
 		list_del(&priv->list);
@@ -67,8 +66,8 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
 	spin_unlock_irqrestore(&vdev->priv_lock, flags);
 
 	if (!urb) {
-		pr_err("cannot find a urb of seqnum %u\n", pdu->base.seqnum);
-		pr_info("max seqnum %d\n",
+		pr_err("cannot find a urb of seqnum %u max seqnum %d\n",
+			pdu->base.seqnum,
 			atomic_read(&vhci_hcd->seqnum));
 		usbip_event_add(ud, VDEV_EVENT_ERROR_TCP);
 		return;
@@ -91,7 +90,7 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
 	if (usbip_dbg_flag_vhci_rx)
 		usbip_dump_urb(urb);
 
-	usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
+	usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum);
 
 	spin_lock_irqsave(&vhci->lock, flags);
 	usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb);
@@ -158,7 +157,7 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev,
 		pr_info("the urb (seqnum %d) was already given back\n",
 			pdu->base.seqnum);
 	} else {
-		usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
+		usbip_dbg_vhci_rx("now giveback urb %d\n", pdu->base.seqnum);
 
 		/* If unlink is successful, status is -ECONNRESET */
 		urb->status = pdu->u.ret_unlink.status;
diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c
index d625a2ff4b71..9aed15a358b7 100644
--- a/drivers/usb/usbip/vhci_tx.c
+++ b/drivers/usb/usbip/vhci_tx.c
@@ -69,7 +69,8 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev)
 		memset(&msg, 0, sizeof(msg));
 		memset(&iov, 0, sizeof(iov));
 
-		usbip_dbg_vhci_tx("setup txdata urb %p\n", urb);
+		usbip_dbg_vhci_tx("setup txdata urb seqnum %lu\n",
+				  priv->seqnum);
 
 		/* 1. setup usbip_header */
 		setup_cmd_submit_pdu(&pdu_header, urb);

From 248a22044366f588d46754c54dfe29ffe4f8b4df Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Mon, 18 Dec 2017 17:23:37 -0700
Subject: [PATCH 067/305] usbip: stub: stop printing kernel pointer addresses
 in messages

Remove and/or change debug, info, and error messages so that they do not
print kernel pointer addresses.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub_main.c | 5 +++--
 drivers/usb/usbip/stub_rx.c   | 7 ++-----
 drivers/usb/usbip/stub_tx.c   | 6 +++---
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
index 4f48b306713f..c31c8402a0c5 100644
--- a/drivers/usb/usbip/stub_main.c
+++ b/drivers/usb/usbip/stub_main.c
@@ -237,11 +237,12 @@ void stub_device_cleanup_urbs(struct stub_device *sdev)
 	struct stub_priv *priv;
 	struct urb *urb;
 
-	dev_dbg(&sdev->udev->dev, "free sdev %p\n", sdev);
+	dev_dbg(&sdev->udev->dev, "Stub device cleaning up urbs\n");
 
 	while ((priv = stub_priv_pop(sdev))) {
 		urb = priv->urb;
-		dev_dbg(&sdev->udev->dev, "free urb %p\n", urb);
+		dev_dbg(&sdev->udev->dev, "free urb seqnum %lu\n",
+			priv->seqnum);
 		usb_kill_urb(urb);
 
 		kmem_cache_free(stub_priv_cache, priv);
diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c
index 493ac2928391..2f29be474098 100644
--- a/drivers/usb/usbip/stub_rx.c
+++ b/drivers/usb/usbip/stub_rx.c
@@ -211,9 +211,6 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
 		if (priv->seqnum != pdu->u.cmd_unlink.seqnum)
 			continue;
 
-		dev_info(&priv->urb->dev->dev, "unlink urb %p\n",
-			 priv->urb);
-
 		/*
 		 * This matched urb is not completed yet (i.e., be in
 		 * flight in usb hcd hardware/driver). Now we are
@@ -252,8 +249,8 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
 		ret = usb_unlink_urb(priv->urb);
 		if (ret != -EINPROGRESS)
 			dev_err(&priv->urb->dev->dev,
-				"failed to unlink a urb %p, ret %d\n",
-				priv->urb, ret);
+				"failed to unlink a urb # %lu, ret %d\n",
+				priv->seqnum, ret);
 
 		return 0;
 	}
diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c
index 53172b1f6257..f0ec41a50cbc 100644
--- a/drivers/usb/usbip/stub_tx.c
+++ b/drivers/usb/usbip/stub_tx.c
@@ -88,7 +88,7 @@ void stub_complete(struct urb *urb)
 	/* link a urb to the queue of tx. */
 	spin_lock_irqsave(&sdev->priv_lock, flags);
 	if (sdev->ud.tcp_socket == NULL) {
-		usbip_dbg_stub_tx("ignore urb for closed connection %p", urb);
+		usbip_dbg_stub_tx("ignore urb for closed connection\n");
 		/* It will be freed in stub_device_cleanup_urbs(). */
 	} else if (priv->unlinking) {
 		stub_enqueue_ret_unlink(sdev, priv->seqnum, urb->status);
@@ -190,8 +190,8 @@ static int stub_send_ret_submit(struct stub_device *sdev)
 
 		/* 1. setup usbip_header */
 		setup_ret_submit_pdu(&pdu_header, urb);
-		usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n",
-				  pdu_header.base.seqnum, urb);
+		usbip_dbg_stub_tx("setup txdata seqnum: %d\n",
+				  pdu_header.base.seqnum);
 		usbip_header_correct_endian(&pdu_header, 1);
 
 		iov[iovnum].iov_base = &pdu_header;

From 90120d15f4c397272aaf41077960a157fc4212bf Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Fri, 15 Dec 2017 10:50:09 -0700
Subject: [PATCH 068/305] usbip: prevent leaking socket pointer address in
 messages

The usbip driver is leaking the socket pointer address in messages. Remove
the messages that aren't useful and print sockfd instead in the ones that
are useful for debugging.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub_dev.c     |  3 +--
 drivers/usb/usbip/usbip_common.c | 16 +++++-----------
 drivers/usb/usbip/vhci_hcd.c     |  2 +-
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c
index a3df8ee82faf..e31a6f204397 100644
--- a/drivers/usb/usbip/stub_dev.c
+++ b/drivers/usb/usbip/stub_dev.c
@@ -149,8 +149,7 @@ static void stub_shutdown_connection(struct usbip_device *ud)
 	 * step 1?
 	 */
 	if (ud->tcp_socket) {
-		dev_dbg(&sdev->udev->dev, "shutdown tcp_socket %p\n",
-			ud->tcp_socket);
+		dev_dbg(&sdev->udev->dev, "shutdown sockfd %d\n", ud->sockfd);
 		kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
 	}
 
diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c
index f7978933b402..7b219d9109b4 100644
--- a/drivers/usb/usbip/usbip_common.c
+++ b/drivers/usb/usbip/usbip_common.c
@@ -317,26 +317,20 @@ int usbip_recv(struct socket *sock, void *buf, int size)
 	struct msghdr msg = {.msg_flags = MSG_NOSIGNAL};
 	int total = 0;
 
+	if (!sock || !buf || !size)
+		return -EINVAL;
+
 	iov_iter_kvec(&msg.msg_iter, READ|ITER_KVEC, &iov, 1, size);
 
 	usbip_dbg_xmit("enter\n");
 
-	if (!sock || !buf || !size) {
-		pr_err("invalid arg, sock %p buff %p size %d\n", sock, buf,
-		       size);
-		return -EINVAL;
-	}
-
 	do {
-		int sz = msg_data_left(&msg);
+		msg_data_left(&msg);
 		sock->sk->sk_allocation = GFP_NOIO;
 
 		result = sock_recvmsg(sock, &msg, MSG_WAITALL);
-		if (result <= 0) {
-			pr_debug("receive sock %p buf %p size %u ret %d total %d\n",
-				 sock, buf + total, sz, result, total);
+		if (result <= 0)
 			goto err;
-		}
 
 		total += result;
 	} while (msg_data_left(&msg));
diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c
index 9efab3dc3734..c3e1008aa491 100644
--- a/drivers/usb/usbip/vhci_hcd.c
+++ b/drivers/usb/usbip/vhci_hcd.c
@@ -965,7 +965,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud)
 
 	/* need this? see stub_dev.c */
 	if (ud->tcp_socket) {
-		pr_debug("shutdown tcp_socket %p\n", ud->tcp_socket);
+		pr_debug("shutdown tcp_socket %d\n", ud->sockfd);
 		kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
 	}
 

From 10c90120930628e8b959bf58d4a0aaef3ae5d945 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Fri, 15 Dec 2017 10:05:15 -0700
Subject: [PATCH 069/305] usbip: stub_rx: fix static checker warning on
 unnecessary checks

Fix the following static checker warnings:

The patch c6688ef9f297: "usbip: fix stub_rx: harden CMD_SUBMIT path
to handle malicious input" from Dec 7, 2017, leads to the following
static checker warning:

    drivers/usb/usbip/stub_rx.c:346 get_pipe()
    warn: impossible condition
'(pdu->u.cmd_submit.transfer_buffer_length > ((~0 >> 1))) =>
(s32min-s32max > s32max)'
    drivers/usb/usbip/stub_rx.c:486 stub_recv_cmd_submit()
    warn: always true condition
'(pdu->u.cmd_submit.transfer_buffer_length <= ((~0 >> 1))) =>
(s32min-s32max <= s32max)'

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub_rx.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c
index 2f29be474098..6c5a59313999 100644
--- a/drivers/usb/usbip/stub_rx.c
+++ b/drivers/usb/usbip/stub_rx.c
@@ -339,14 +339,6 @@ static int get_pipe(struct stub_device *sdev, struct usbip_header *pdu)
 
 	epd = &ep->desc;
 
-	/* validate transfer_buffer_length */
-	if (pdu->u.cmd_submit.transfer_buffer_length > INT_MAX) {
-		dev_err(&sdev->udev->dev,
-			"CMD_SUBMIT: -EMSGSIZE transfer_buffer_length %d\n",
-			pdu->u.cmd_submit.transfer_buffer_length);
-		return -1;
-	}
-
 	if (usb_endpoint_xfer_control(epd)) {
 		if (dir == USBIP_DIR_OUT)
 			return usb_sndctrlpipe(udev, epnum);
@@ -479,8 +471,7 @@ static void stub_recv_cmd_submit(struct stub_device *sdev,
 	}
 
 	/* allocate urb transfer buffer, if needed */
-	if (pdu->u.cmd_submit.transfer_buffer_length > 0 &&
-	    pdu->u.cmd_submit.transfer_buffer_length <= INT_MAX) {
+	if (pdu->u.cmd_submit.transfer_buffer_length > 0) {
 		priv->urb->transfer_buffer =
 			kzalloc(pdu->u.cmd_submit.transfer_buffer_length,
 				GFP_KERNEL);

From 544c4605acc5ae4afe7dd5914147947db182f2fb Mon Sep 17 00:00:00 2001
From: Juan Zea <juan.zea@qindel.com>
Date: Fri, 15 Dec 2017 10:21:20 +0100
Subject: [PATCH 070/305] usbip: fix usbip bind writing random string after
 command in match_busid

usbip bind writes the command followed by random bytes when writing to the
match_busid attribute in sysfs, because it passes the full size of the
command buffer instead of the length of the string.

Signed-off-by: Juan Zea <juan.zea@qindel.com>
Acked-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 tools/usb/usbip/src/utils.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/usb/usbip/src/utils.c b/tools/usb/usbip/src/utils.c
index 2b3d6d235015..3d7b42e77299 100644
--- a/tools/usb/usbip/src/utils.c
+++ b/tools/usb/usbip/src/utils.c
@@ -30,6 +30,7 @@ int modify_match_busid(char *busid, int add)
 	char command[SYSFS_BUS_ID_SIZE + 4];
 	char match_busid_attr_path[SYSFS_PATH_MAX];
 	int rc;
+	int cmd_size;
 
 	snprintf(match_busid_attr_path, sizeof(match_busid_attr_path),
 		 "%s/%s/%s/%s/%s/%s", SYSFS_MNT_PATH, SYSFS_BUS_NAME,
@@ -37,12 +38,14 @@ int modify_match_busid(char *busid, int add)
 		 attr_name);
 
 	if (add)
-		snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s", busid);
+		cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s",
+				    busid);
 	else
-		snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s", busid);
+		cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s",
+				    busid);
 
 	rc = write_sysfs_attribute(match_busid_attr_path, command,
-				   sizeof(command));
+				   cmd_size);
 	if (rc < 0) {
 		dbg("failed to write match_busid: %s", strerror(errno));
 		return -1;

From b9096d9f15c142574ebebe8fbb137012bb9d99c2 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.com>
Date: Tue, 12 Dec 2017 16:11:30 +0100
Subject: [PATCH 071/305] usb: add RESET_RESUME for ELSA MicroLink 56K

This modem needs this quirk to operate. It produces timeouts when
resumed without reset.

Signed-off-by: Oliver Neukum <oneukum@suse.com>
CC: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/quirks.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index a10b346b9777..95812656d9b9 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -149,6 +149,9 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */
 	{ USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM },
 
+	/* ELSA MicroLink 56K */
+	{ USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME },
+
 	/* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */
 	{ USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM },
 

From 7f038d256c723dd390d2fca942919573995f4cfd Mon Sep 17 00:00:00 2001
From: Dmitry Fleytman Dmitry Fleytman <dmitry.fleytman@gmail.com>
Date: Tue, 19 Dec 2017 06:02:04 +0200
Subject: [PATCH 072/305] usb: Add device quirk for Logitech HD Pro Webcam
 C925e

Commit e0429362ab15
("usb: Add device quirk for Logitech HD Pro Webcams C920 and C930e")
introduced a quirk to work around an issue with some Logitech webcams.

There is one more model that has the same issue - C925e, so applying
the same quirk as well.

See aforementioned commit message for detailed explanation of the problem.

Signed-off-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/quirks.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index 95812656d9b9..4024926c1d68 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -52,10 +52,11 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* Microsoft LifeCam-VX700 v2.0 */
 	{ USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME },
 
-	/* Logitech HD Pro Webcams C920, C920-C and C930e */
+	/* Logitech HD Pro Webcams C920, C920-C, C925e and C930e */
 	{ USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT },
 	{ USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT },
 	{ USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT },
+	{ USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT },
 
 	/* Logitech ConferenceCam CC3000e */
 	{ USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT },

From a93639090a2743c8e205c1ac25439702702b4ce4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 14 Dec 2017 15:43:43 +1100
Subject: [PATCH 073/305] staging: lustre: lnet: Fix recent breakage from
 list_for_each conversion

Commit 8e55b6fd0660 ("staging: lustre: lnet: replace list_for_each
with list_for_each_entry") was intended to be an idempotent change,
but actually broke the behavior of ksocknal_add_peer() causing mounts to fail.
The fact that it caused an existing "route2 = NULL;" to become
redundant could have been a clue.  The fact that the loop body
set the new loop variable to NULL might also have been a clue.

The original code relied on "route2" being NULL if nothing was found.
The new code would always set route2 to a non-NULL value if the list
was empty, and would likely crash if the list was not empty.

Restore correct functionality by using code-flow rather than the value of
"route2" to determine whether to use an old route, or to add a new one.

Fixes: 8e55b6fd0660 ("staging: lustre: lnet: replace list_for_each with list_for_each_entry")
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 .../lustre/lnet/klnds/socklnd/socklnd.c       | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
index 986c2a40d978..8267119ccc8e 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
@@ -487,21 +487,18 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr,
 			      ksocknal_nid2peerlist(id.nid));
 	}
 
-	route2 = NULL;
 	list_for_each_entry(route2, &peer->ksnp_routes, ksnr_list) {
-		if (route2->ksnr_ipaddr == ipaddr)
-			break;
-
-		route2 = NULL;
+		if (route2->ksnr_ipaddr == ipaddr) {
+			/* Route already exists, use the old one */
+			ksocknal_route_decref(route);
+			route2->ksnr_share_count++;
+			goto out;
+		}
 	}
-	if (!route2) {
-		ksocknal_add_route_locked(peer, route);
-		route->ksnr_share_count++;
-	} else {
-		ksocknal_route_decref(route);
-		route2->ksnr_share_count++;
-	}
-
+	/* Route doesn't already exist, add the new one */
+	ksocknal_add_route_locked(peer, route);
+	route->ksnr_share_count++;
+out:
 	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
 	return 0;

From 116d2f7496c51b2e02e8e4ecdd2bdf5fb9d5a641 Mon Sep 17 00:00:00 2001
From: Prateek Sood <prsood@codeaurora.org>
Date: Tue, 19 Dec 2017 12:56:57 +0530
Subject: [PATCH 074/305] cgroup: Fix deadlock in cpu hotplug path

A deadlock occurs during cgroup migration from the cpu hotplug path when
a task T is being moved from the source to the destination cgroup.

kworker/0:0
cpuset_hotplug_workfn()
   cpuset_hotplug_update_tasks()
      hotplug_update_tasks_legacy()
        remove_tasks_in_empty_cpuset()
          cgroup_transfer_tasks() // stuck in iterator loop
            cgroup_migrate()
              cgroup_migrate_add_task()

In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T.
Task T will not migrate to destination cgroup. css_task_iter_start()
will keep pointing to task T in loop waiting for task T cg_list node
to be removed.

Task T
do_exit()
  exit_signals() // sets PF_EXITING
  exit_task_namespaces()
    switch_task_namespaces()
      free_nsproxy()
        put_mnt_ns()
          drop_collected_mounts()
            namespace_unlock()
              synchronize_rcu()
                _synchronize_rcu_expedited()
                  schedule_work() // on cpu0 low priority worker pool
                  wait_event() // waiting for work item to execute

Task T inserted a work item in the worklist of cpu0 low priority
worker pool. It is waiting for expedited grace period work item
to execute. This work item will only be executed once kworker/0:0
completes execution of cpuset_hotplug_workfn().

kworker/0:0 ==> Task T ==>kworker/0:0

In case of PF_EXITING task being migrated from source to destination
cgroup, migrate next available task in source cgroup.

Signed-off-by: Prateek Sood <prsood@codeaurora.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup-v1.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085daab1a..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	 */
 	do {
 		css_task_iter_start(&from->self, 0, &it);
-		task = css_task_iter_next(&it);
+
+		do {
+			task = css_task_iter_next(&it);
+		} while (task && (task->flags & PF_EXITING));
+
 		if (task)
 			get_task_struct(task);
 		css_task_iter_end(&it);

From f292b9b28097d8fe870336108e91bd95a14294bf Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 15 Dec 2017 19:59:47 -0800
Subject: [PATCH 075/305] staging: ion: Fix ion_cma_heap allocations

In trying to add support for drm_hwcomposer to HiKey,
I've needed to utilize the ION CMA heap, and I've noticed
problems with allocations on newer kernels failing.

It seems back with 204f672255c2 ("ion: Use CMA APIs directly"),
the ion_cma_heap code was modified to use the CMA API, but
kept the arguments as buffer lengths rather than the number of pages.

This results in errors as we don't have enough pages in CMA to
satisfy the exaggerated requests.

This patch converts the ion_cma_heap CMA API usage to properly
request pages.

It also fixes a minor issue in the allocation where in the error
path, the cma_release is called with the buffer->size value which
hasn't yet been set.

Cc: Laura Abbott <labbott@redhat.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Benjamin Gaignard <benjamin.gaignard@linaro.org>
Cc: Archit Taneja <architt@codeaurora.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Dmitry Shmidt <dimitrysh@google.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Amit Pundir <amit.pundir@linaro.org>
Fixes: 204f672255c2 ("staging: android: ion: Use CMA APIs directly")
Acked-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/android/ion/Kconfig        |  2 +-
 drivers/staging/android/ion/ion_cma_heap.c | 15 +++++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/staging/android/ion/Kconfig b/drivers/staging/android/ion/Kconfig
index a517b2d29f1b..8f6494158d3d 100644
--- a/drivers/staging/android/ion/Kconfig
+++ b/drivers/staging/android/ion/Kconfig
@@ -37,7 +37,7 @@ config ION_CHUNK_HEAP
 
 config ION_CMA_HEAP
 	bool "Ion CMA heap support"
-	depends on ION && CMA
+	depends on ION && DMA_CMA
 	help
 	  Choose this option to enable CMA heaps with Ion. This heap is backed
 	  by the Contiguous Memory Allocator (CMA). If your system has these
diff --git a/drivers/staging/android/ion/ion_cma_heap.c b/drivers/staging/android/ion/ion_cma_heap.c
index dd5545d9990a..86196ffd2faf 100644
--- a/drivers/staging/android/ion/ion_cma_heap.c
+++ b/drivers/staging/android/ion/ion_cma_heap.c
@@ -39,9 +39,15 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
 	struct ion_cma_heap *cma_heap = to_cma_heap(heap);
 	struct sg_table *table;
 	struct page *pages;
+	unsigned long size = PAGE_ALIGN(len);
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	unsigned long align = get_order(size);
 	int ret;
 
-	pages = cma_alloc(cma_heap->cma, len, 0, GFP_KERNEL);
+	if (align > CONFIG_CMA_ALIGNMENT)
+		align = CONFIG_CMA_ALIGNMENT;
+
+	pages = cma_alloc(cma_heap->cma, nr_pages, align, GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
 
@@ -53,7 +59,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
 	if (ret)
 		goto free_mem;
 
-	sg_set_page(table->sgl, pages, len, 0);
+	sg_set_page(table->sgl, pages, size, 0);
 
 	buffer->priv_virt = pages;
 	buffer->sg_table = table;
@@ -62,7 +68,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
 free_mem:
 	kfree(table);
 err:
-	cma_release(cma_heap->cma, pages, buffer->size);
+	cma_release(cma_heap->cma, pages, nr_pages);
 	return -ENOMEM;
 }
 
@@ -70,9 +76,10 @@ static void ion_cma_free(struct ion_buffer *buffer)
 {
 	struct ion_cma_heap *cma_heap = to_cma_heap(buffer->heap);
 	struct page *pages = buffer->priv_virt;
+	unsigned long nr_pages = PAGE_ALIGN(buffer->size) >> PAGE_SHIFT;
 
 	/* release memory */
-	cma_release(cma_heap->cma, pages, buffer->size);
+	cma_release(cma_heap->cma, pages, nr_pages);
 	/* release sg table */
 	sg_free_table(buffer->sg_table);
 	kfree(buffer->sg_table);

From d6b246bb7a29703f53aa4c050b8b3205d749caee Mon Sep 17 00:00:00 2001
From: Sushmita Susheelendra <ssusheel@codeaurora.org>
Date: Fri, 15 Dec 2017 13:59:13 -0700
Subject: [PATCH 076/305] staging: android: ion: Fix dma direction for
 dma_sync_sg_for_cpu/device

Use the direction argument passed into begin_cpu_access
and end_cpu_access when calling the dma_sync_sg_for_cpu/device.
The actual cache primitive called depends on the direction
passed in.

Signed-off-by: Sushmita Susheelendra <ssusheel@codeaurora.org>
Cc: stable <stable@vger.kernel.org>
Acked-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/android/ion/ion.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index a7d9b0e98572..f480885e346b 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -346,7 +346,7 @@ static int ion_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
 	mutex_lock(&buffer->lock);
 	list_for_each_entry(a, &buffer->attachments, list) {
 		dma_sync_sg_for_cpu(a->dev, a->table->sgl, a->table->nents,
-				    DMA_BIDIRECTIONAL);
+				    direction);
 	}
 	mutex_unlock(&buffer->lock);
 
@@ -368,7 +368,7 @@ static int ion_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
 	mutex_lock(&buffer->lock);
 	list_for_each_entry(a, &buffer->attachments, list) {
 		dma_sync_sg_for_device(a->dev, a->table->sgl, a->table->nents,
-				       DMA_BIDIRECTIONAL);
+				       direction);
 	}
 	mutex_unlock(&buffer->lock);
 

From 74d0833c659a8a54735e5efdd44f4b225af68586 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 20 Dec 2017 07:09:19 -0800
Subject: [PATCH 077/305] cgroup: fix css_task_iter crash on CSS_TASK_ITER_PROC

While teaching css_task_iter to handle skipping over tasks which
aren't group leaders, bc2fb7ed089f ("cgroup: add @flags to
css_task_iter_start() and implement CSS_TASK_ITER_PROCS") introduced a
silly bug.

CSS_TASK_ITER_PROCS is implemented by repeating
css_task_iter_advance() while the advanced cursor is pointing to a
non-leader thread.  However, the cursor variable, @l, wasn't updated
when the iteration has to advance to the next css_set and the
following repetition would operate on the terminal @l from the
previous iteration which isn't pointing to a valid task leading to
oopses like the following or infinite looping.

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000254
  IP: __task_pid_nr_ns+0xc7/0xf0
  PGD 0 P4D 0
  Oops: 0000 [#1] SMP
  ...
  CPU: 2 PID: 1 Comm: systemd Not tainted 4.14.4-200.fc26.x86_64 #1
  Hardware name: System manufacturer System Product Name/PRIME B350M-A, BIOS 3203 11/09/2017
  task: ffff88c4baee8000 task.stack: ffff96d5c3158000
  RIP: 0010:__task_pid_nr_ns+0xc7/0xf0
  RSP: 0018:ffff96d5c315bd50 EFLAGS: 00010206
  RAX: 0000000000000000 RBX: ffff88c4b68c6000 RCX: 0000000000000250
  RDX: ffffffffa5e47960 RSI: 0000000000000000 RDI: ffff88c490f6ab00
  RBP: ffff96d5c315bd50 R08: 0000000000001000 R09: 0000000000000005
  R10: ffff88c4be006b80 R11: ffff88c42f1b8004 R12: ffff96d5c315bf18
  R13: ffff88c42d7dd200 R14: ffff88c490f6a510 R15: ffff88c4b68c6000
  FS:  00007f9446f8ea00(0000) GS:ffff88c4be680000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000254 CR3: 00000007f956f000 CR4: 00000000003406e0
  Call Trace:
   cgroup_procs_show+0x19/0x30
   cgroup_seqfile_show+0x4c/0xb0
   kernfs_seq_show+0x21/0x30
   seq_read+0x2ec/0x3f0
   kernfs_fop_read+0x134/0x180
   __vfs_read+0x37/0x160
   ? security_file_permission+0x9b/0xc0
   vfs_read+0x8e/0x130
   SyS_read+0x55/0xc0
   entry_SYSCALL_64_fastpath+0x1a/0xa5
  RIP: 0033:0x7f94455f942d
  RSP: 002b:00007ffe81ba2d00 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
  RAX: ffffffffffffffda RBX: 00005574e2233f00 RCX: 00007f94455f942d
  RDX: 0000000000001000 RSI: 00005574e2321a90 RDI: 000000000000002b
  RBP: 0000000000000000 R08: 00005574e2321a90 R09: 00005574e231de60
  R10: 00007f94458c8b38 R11: 0000000000000293 R12: 00007f94458c8ae0
  R13: 00007ffe81ba3800 R14: 0000000000000000 R15: 00005574e2116560
  Code: 04 74 0e 89 f6 48 8d 04 76 48 8d 04 c5 f0 05 00 00 48 8b bf b8 05 00 00 48 01 c7 31 c0 48 8b 0f 48 85 c9 74 18 8b b2 30 08 00 00 <3b> 71 04 77 0d 48 c1 e6 05 48 01 f1 48 3b 51 38 74 09 5d c3 8b
  RIP: __task_pid_nr_ns+0xc7/0xf0 RSP: ffff96d5c315bd50

Fix it by moving the initialization of the cursor below the repeat
label.  While at it, rename it to @next for readability.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: bc2fb7ed089f ("cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS")
Cc: stable@vger.kernel.org # v4.14+
Reported-by: Laura Abbott <labbott@redhat.com>
Reported-by: Bronek Kozicki <brok@incorrekt.com>
Reported-by: George Amanakis <gamanakis@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index f4c2f8cb5748..2cf06c274e4c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 
 static void css_task_iter_advance(struct css_task_iter *it)
 {
-	struct list_head *l = it->task_pos;
+	struct list_head *next;
 
 	lockdep_assert_held(&css_set_lock);
-	WARN_ON_ONCE(!l);
-
 repeat:
 	/*
 	 * Advance iterator to find next entry.  cset->tasks is consumed
 	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
 	 * next cset.
 	 */
-	l = l->next;
+	next = it->task_pos->next;
 
-	if (l == it->tasks_head)
-		l = it->mg_tasks_head->next;
+	if (next == it->tasks_head)
+		next = it->mg_tasks_head->next;
 
-	if (l == it->mg_tasks_head)
+	if (next == it->mg_tasks_head)
 		css_task_iter_advance_css_set(it);
 	else
-		it->task_pos = l;
+		it->task_pos = next;
 
 	/* if PROCS, skip over tasks which aren't group leaders */
 	if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&

From d0729bc6bee797fb4bcca87583af5adbfe79ecfb Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Date: Mon, 11 Dec 2017 21:50:25 +0900
Subject: [PATCH 078/305] arc: do not use __print_symbol()

__print_symbol() uses extra stack space to sprintf() symbol
information and then to feed that buffer to printk()

  char buffer[KSYM_SYMBOL_LEN];

  sprint_symbol(buffer, address);
  printk(fmt, buffer);

Replace __print_symbol() with a direct printk("%pS") call.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/kernel/stacktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c
index 74315f302971..bf40e06f3fb8 100644
--- a/arch/arc/kernel/stacktrace.c
+++ b/arch/arc/kernel/stacktrace.c
@@ -163,7 +163,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
  */
 static int __print_sym(unsigned int address, void *unused)
 {
-	__print_symbol("  %s\n", address);
+	printk("  %pS\n", (void *)address);
 	return 0;
 }
 

From c18fc9071762769acb4040cabae45c817aefc537 Mon Sep 17 00:00:00 2001
From: Alexey Brodkin <Alexey.Brodkin@synopsys.com>
Date: Tue, 5 Dec 2017 13:19:38 +0300
Subject: [PATCH 079/305] ARC: [plat-hsdk] Switch DisplayLink driver from fbdev
 to DRM

Currently there're 2 different implementations of the driver for
DisplayLink USB2.0-to-HDMI/DVI adapters: older FBDEV and modern true
DRM.

We initially decided to use FBDEV version just because with it
/dev/fbX is usable from user-space while in DRM version
with DRM_FBDEV_EMULATION user-space cannot draw anything on a real
screen, for more info read [1].

But today /dev/fbX is not that important as more and more software
projects switch to use of DRI (/dev/dri/cardX).

But what's even more important DRM driver allows building of complicated
graphics processing chains. The most important for us is rendering of
3D on a dedicated GPU while outputting video through a simpler
bitstreamer like DisplayLink. So let's use much more future-proof
driver from now on.

[1] https://lists.freedesktop.org/archives/dri-devel/2017-December/159519.html

Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/configs/hsdk_defconfig | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig
index 7b8f8faf8a24..ac6b0ed8341e 100644
--- a/arch/arc/configs/hsdk_defconfig
+++ b/arch/arc/configs/hsdk_defconfig
@@ -49,10 +49,11 @@ CONFIG_SERIAL_8250_DW=y
 CONFIG_SERIAL_OF_PLATFORM=y
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
+CONFIG_DRM=y
+# CONFIG_DRM_FBDEV_EMULATION is not set
+CONFIG_DRM_UDL=y
 CONFIG_FB=y
-CONFIG_FB_UDL=y
 CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_USB=y
 CONFIG_USB_EHCI_HCD=y
 CONFIG_USB_EHCI_HCD_PLATFORM=y
 CONFIG_USB_OHCI_HCD=y

From a08c832f277d7a6f9d3b341a5d5df2f5576220d8 Mon Sep 17 00:00:00 2001
From: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Date: Sat, 9 Dec 2017 16:59:15 +0300
Subject: [PATCH 080/305] ARC: [plat-hsdk]: Set initial core pll output
 frequency

Set initial core pll output frequency specified in device tree to
1GHz. It will be applied at the core pll driver probing.

Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/boot/dts/hsdk.dts | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts
index 8f627c200d60..006aa3de5348 100644
--- a/arch/arc/boot/dts/hsdk.dts
+++ b/arch/arc/boot/dts/hsdk.dts
@@ -114,6 +114,14 @@
 			reg = <0x00 0x10>, <0x14B8 0x4>;
 			#clock-cells = <0>;
 			clocks = <&input_clk>;
+
+			/*
+			 * Set initial core pll output frequency to 1GHz.
+			 * It will be applied at the core pll driver probing
+			 * on early boot.
+			 */
+			assigned-clocks = <&core_clk>;
+			assigned-clock-rates = <1000000000>;
 		};
 
 		serial: serial@5000 {

From 7bde846d0957fb81ac0bf8c4e2cab284a1da34e0 Mon Sep 17 00:00:00 2001
From: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Date: Sat, 9 Dec 2017 16:59:16 +0300
Subject: [PATCH 081/305] ARC: [plat-hsdk]: Get rid of core pll frequency set
 in platform code

Get rid of core pll frequency set in platform code as we set it via
device tree using 'assigned-clock-rates' property.

Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/plat-hsdk/platform.c | 42 -----------------------------------
 1 file changed, 42 deletions(-)

diff --git a/arch/arc/plat-hsdk/platform.c b/arch/arc/plat-hsdk/platform.c
index fd0ae5e38639..2958aedb649a 100644
--- a/arch/arc/plat-hsdk/platform.c
+++ b/arch/arc/plat-hsdk/platform.c
@@ -38,42 +38,6 @@ static void __init hsdk_init_per_cpu(unsigned int cpu)
 #define CREG_PAE		(CREG_BASE + 0x180)
 #define CREG_PAE_UPDATE		(CREG_BASE + 0x194)
 
-#define CREG_CORE_IF_CLK_DIV	(CREG_BASE + 0x4B8)
-#define CREG_CORE_IF_CLK_DIV_2	0x1
-#define CGU_BASE		ARC_PERIPHERAL_BASE
-#define CGU_PLL_STATUS		(ARC_PERIPHERAL_BASE + 0x4)
-#define CGU_PLL_CTRL		(ARC_PERIPHERAL_BASE + 0x0)
-#define CGU_PLL_STATUS_LOCK	BIT(0)
-#define CGU_PLL_STATUS_ERR	BIT(1)
-#define CGU_PLL_CTRL_1GHZ	0x3A10
-#define HSDK_PLL_LOCK_TIMEOUT	500
-
-#define HSDK_PLL_LOCKED() \
-	!!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_LOCK)
-
-#define HSDK_PLL_ERR() \
-	!!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_ERR)
-
-static void __init hsdk_set_cpu_freq_1ghz(void)
-{
-	u32 timeout = HSDK_PLL_LOCK_TIMEOUT;
-
-	/*
-	 * As we set cpu clock which exceeds 500MHz, the divider for the interface
-	 * clock must be programmed to div-by-2.
-	 */
-	iowrite32(CREG_CORE_IF_CLK_DIV_2, (void __iomem *) CREG_CORE_IF_CLK_DIV);
-
-	/* Set cpu clock to 1GHz */
-	iowrite32(CGU_PLL_CTRL_1GHZ, (void __iomem *) CGU_PLL_CTRL);
-
-	while (!HSDK_PLL_LOCKED() && timeout--)
-		cpu_relax();
-
-	if (!HSDK_PLL_LOCKED() || HSDK_PLL_ERR())
-		pr_err("Failed to setup CPU frequency to 1GHz!");
-}
-
 #define SDIO_BASE		(ARC_PERIPHERAL_BASE + 0xA000)
 #define SDIO_UHS_REG_EXT	(SDIO_BASE + 0x108)
 #define SDIO_UHS_REG_EXT_DIV_2	(2 << 30)
@@ -98,12 +62,6 @@ static void __init hsdk_init_early(void)
 	 * minimum possible div-by-2.
 	 */
 	iowrite32(SDIO_UHS_REG_EXT_DIV_2, (void __iomem *) SDIO_UHS_REG_EXT);
-
-	/*
-	 * Setup CPU frequency to 1GHz.
-	 * TODO: remove it after smart hsdk pll driver will be introduced.
-	 */
-	hsdk_set_cpu_freq_1ghz();
 }
 
 static const char *hsdk_compat[] __initconst = {

From fbd1cec57064aa1380726ec899c49fcd84e702b9 Mon Sep 17 00:00:00 2001
From: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Date: Sat, 9 Dec 2017 16:59:17 +0300
Subject: [PATCH 082/305] ARC: [plat-axs103]: Set initial core pll output
 frequency

Set initial core pll output frequency specified in device tree to
100MHz for SMP configuration and 90MHz for UP configuration.
It will be applied at the core pll driver probing.

Update platform quirk for decreasing core frequency for quad core
configuration.

Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/boot/dts/axc003.dtsi     | 8 ++++++++
 arch/arc/boot/dts/axc003_idu.dtsi | 8 ++++++++
 arch/arc/plat-axs10x/axs10x.c     | 8 ++------
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi
index 4e6e9f57e790..dc91c663bcc0 100644
--- a/arch/arc/boot/dts/axc003.dtsi
+++ b/arch/arc/boot/dts/axc003.dtsi
@@ -35,6 +35,14 @@
 			reg = <0x80 0x10>, <0x100 0x10>;
 			#clock-cells = <0>;
 			clocks = <&input_clk>;
+
+			/*
+			 * Set initial core pll output frequency to 90MHz.
+			 * It will be applied at the core pll driver probing
+			 * on early boot.
+			 */
+			assigned-clocks = <&core_clk>;
+			assigned-clock-rates = <90000000>;
 		};
 
 		core_intc: archs-intc@cpu {
diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi
index 63954a8b0100..69ff4895f2ba 100644
--- a/arch/arc/boot/dts/axc003_idu.dtsi
+++ b/arch/arc/boot/dts/axc003_idu.dtsi
@@ -35,6 +35,14 @@
 			reg = <0x80 0x10>, <0x100 0x10>;
 			#clock-cells = <0>;
 			clocks = <&input_clk>;
+
+			/*
+			 * Set initial core pll output frequency to 100MHz.
+			 * It will be applied at the core pll driver probing
+			 * on early boot.
+			 */
+			assigned-clocks = <&core_clk>;
+			assigned-clock-rates = <100000000>;
 		};
 
 		core_intc: archs-intc@cpu {
diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c
index f1ac6790da5f..ac1a712f6f1f 100644
--- a/arch/arc/plat-axs10x/axs10x.c
+++ b/arch/arc/plat-axs10x/axs10x.c
@@ -320,22 +320,18 @@ static void __init axs103_early_init(void)
 	unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F;
 	if (num_cores > 2) {
 		u32 freq = 50, orig;
-		/*
-		 * TODO: use cpu node "cpu-freq" param instead of platform-specific
-		 * "/cpu_card/core_clk" as it works only if we use fixed-clock for cpu.
-		 */
 		int off = fdt_path_offset(initial_boot_params, "/cpu_card/core_clk");
 		const struct fdt_property *prop;
 
 		prop = fdt_get_property(initial_boot_params, off,
-					"clock-frequency", NULL);
+					"assigned-clock-rates", NULL);
 		orig = be32_to_cpu(*(u32*)(prop->data)) / 1000000;
 
 		/* Patching .dtb in-place with new core clock value */
 		if (freq != orig ) {
 			freq = cpu_to_be32(freq * 1000000);
 			fdt_setprop_inplace(initial_boot_params, off,
-					    "clock-frequency", &freq, sizeof(freq));
+					    "assigned-clock-rates", &freq, sizeof(freq));
 		}
 	}
 #endif

From d7de73b586b2db540187ff8a077330fa1a8efd64 Mon Sep 17 00:00:00 2001
From: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Date: Sat, 9 Dec 2017 16:59:18 +0300
Subject: [PATCH 083/305] ARC: [plat-axs103] refactor the quad core DT quirk
 code

Refactor the quad core DT quirk code:
get rid of the wasteful division and multiplication by the 1000000 constant.

Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/plat-axs10x/axs10x.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c
index ac1a712f6f1f..46544e88492d 100644
--- a/arch/arc/plat-axs10x/axs10x.c
+++ b/arch/arc/plat-axs10x/axs10x.c
@@ -317,19 +317,21 @@ static void __init axs103_early_init(void)
 	 * Instead of duplicating defconfig/DT for SMP/QUAD, add a small hack
 	 * of fudging the freq in DT
 	 */
+#define AXS103_QUAD_CORE_CPU_FREQ_HZ	50000000
+
 	unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F;
 	if (num_cores > 2) {
-		u32 freq = 50, orig;
+		u32 freq;
 		int off = fdt_path_offset(initial_boot_params, "/cpu_card/core_clk");
 		const struct fdt_property *prop;
 
 		prop = fdt_get_property(initial_boot_params, off,
 					"assigned-clock-rates", NULL);
-		orig = be32_to_cpu(*(u32*)(prop->data)) / 1000000;
+		freq = be32_to_cpu(*(u32 *)(prop->data));
 
 		/* Patching .dtb in-place with new core clock value */
-		if (freq != orig ) {
-			freq = cpu_to_be32(freq * 1000000);
+		if (freq != AXS103_QUAD_CORE_CPU_FREQ_HZ) {
+			freq = cpu_to_be32(AXS103_QUAD_CORE_CPU_FREQ_HZ);
 			fdt_setprop_inplace(initial_boot_params, off,
 					    "assigned-clock-rates", &freq, sizeof(freq));
 		}

From 79435ac78d160e4c245544d457850a56f805ac0d Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Fri, 8 Dec 2017 08:26:58 -0800
Subject: [PATCH 084/305] ARC: uaccess: dont use "l" gcc inline asm constraint
 modifier

The "l" constraint used to set up the LP_COUNT register automatically,
but that support has now been removed.

There was an earlier fix 3c7c7a2fc8811 which fixed an instance in delay.h
but somehow missed this one, as the gcc change had not made its way into
production toolchains and gcc was not as pedantic as it is now!

Cc: stable@vger.kernel.org
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/include/asm/uaccess.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h
index f35974ee7264..c9173c02081c 100644
--- a/arch/arc/include/asm/uaccess.h
+++ b/arch/arc/include/asm/uaccess.h
@@ -668,6 +668,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)
 		return 0;
 
 	__asm__ __volatile__(
+	"	mov	lp_count, %5		\n"
 	"	lp	3f			\n"
 	"1:	ldb.ab  %3, [%2, 1]		\n"
 	"	breq.d	%3, 0, 3f               \n"
@@ -684,8 +685,8 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)
 	"	.word   1b, 4b			\n"
 	"	.previous			\n"
 	: "+r"(res), "+r"(dst), "+r"(src), "=r"(val)
-	: "g"(-EFAULT), "l"(count)
-	: "memory");
+	: "g"(-EFAULT), "r"(count)
+	: "lp_count", "lp_start", "lp_end", "memory");
 
 	return res;
 }

From 24c0df82ef7919e4d10cf2e4e65d368eb2e8ea21 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 12:01:21 +0100
Subject: [PATCH 085/305] netfilter: nf_tables: fix chain filter in
 nf_tables_dump_rules()

ctx->chain may be null now that we have very large object names,
so we cannot check for ctx->chain[0] here.

Fixes: b7263e071aba7 ("netfilter: nf_tables: Allow table names of up to 255 chars")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Phil Sutter <phil@nwl.cc>
---
 net/netfilter/nf_tables_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 10798b357481..8d4526651661 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2072,7 +2072,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 				continue;
 
 			list_for_each_entry_rcu(chain, &table->chains, list) {
-				if (ctx && ctx->chain[0] &&
+				if (ctx && ctx->chain &&
 				    strcmp(ctx->chain, chain->name) != 0)
 					continue;
 

From f5a16b93e6291ba1f65f55647cb4cd8d75ed1b35 Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Wed, 20 Dec 2017 12:37:54 -0800
Subject: [PATCH 086/305] ARC: handle gcc generated __builtin_trap()

gcc toggle -fisolate-erroneous-paths-dereference (default at -O2
onwards) isolates faulty code paths such as null pointer access, divide
by zero etc by emitting __builtin_trap()

Newer ARC gcc generates TRAP_S 5 instruction which needs to be handled
and treated like any other unexpected exception
  - user mode  : task terminated with a SEGV
  - kernel mode: die() called after register and stack dump

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/kernel/traps.c        | 6 ++++++
 arch/arc/kernel/troubleshoot.c | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c
index bcd7c9fc5d0f..004f4e4a4c10 100644
--- a/arch/arc/kernel/traps.c
+++ b/arch/arc/kernel/traps.c
@@ -83,6 +83,7 @@ DO_ERROR_INFO(SIGILL, "Illegal Insn (or Seq)", insterror_is_error, ILL_ILLOPC)
 DO_ERROR_INFO(SIGBUS, "Invalid Mem Access", __weak do_memory_error, BUS_ADRERR)
 DO_ERROR_INFO(SIGTRAP, "Breakpoint Set", trap_is_brkpt, TRAP_BRKPT)
 DO_ERROR_INFO(SIGBUS, "Misaligned Access", do_misaligned_error, BUS_ADRALN)
+DO_ERROR_INFO(SIGSEGV, "gcc generated __builtin_trap", do_trap5_error, 0)
 
 /*
  * Entry Point for Misaligned Data access Exception, for emulating in software
@@ -115,6 +116,8 @@ void do_machine_check_fault(unsigned long address, struct pt_regs *regs)
  * Thus TRAP_S <n> can be used for specific purpose
  *  -1 used for software breakpointing (gdb)
  *  -2 used by kprobes
+ *  -5 __builtin_trap() generated by gcc (2018.03 onwards) for toggle such as
+ *     -fno-isolate-erroneous-paths-dereference
  */
 void do_non_swi_trap(unsigned long address, struct pt_regs *regs)
 {
@@ -134,6 +137,9 @@ void do_non_swi_trap(unsigned long address, struct pt_regs *regs)
 		kgdb_trap(regs);
 		break;
 
+	case 5:
+		do_trap5_error(address, regs);
+		break;
 	default:
 		break;
 	}
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 7d8c1d6c2f60..6e9a0a9a6a04 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -163,6 +163,9 @@ static void show_ecr_verbose(struct pt_regs *regs)
 		else
 			pr_cont("Bus Error, check PRM\n");
 #endif
+	} else if (vec == ECR_V_TRAP) {
+		if (regs->ecr_param == 5)
+			pr_cont("gcc generated __builtin_trap\n");
 	} else {
 		pr_cont("Check Programmer's Manual\n");
 	}

From d1b8b2391c24751e44f618fcf86fb55d9a9247fd Mon Sep 17 00:00:00 2001
From: Cathy Avery <cavery@redhat.com>
Date: Tue, 19 Dec 2017 13:32:48 -0500
Subject: [PATCH 087/305] scsi: storvsc: Fix scsi_cmd error assignments in
 storvsc_handle_error

When an I/O is returned with an srb_status of SRB_STATUS_INVALID_LUN
which has zero good_bytes it must be assigned an error. Otherwise the
I/O will be continuously requeued and will cause a deadlock in the case
where disks are being hot added and removed. sd_probe_async will wait
forever for its I/O to complete while holding scsi_sd_probe_domain.

Also returning the default error of DID_TARGET_FAILURE causes multipath
to not retry the I/O resulting in applications receiving I/O errors
before a failover can occur.

Signed-off-by: Cathy Avery <cavery@redhat.com>
Signed-off-by: Long Li <longli@microsoft.com>
Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/storvsc_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 1b06cf0375dc..3b3d1d050cac 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -953,10 +953,11 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb,
 		case TEST_UNIT_READY:
 			break;
 		default:
-			set_host_byte(scmnd, DID_TARGET_FAILURE);
+			set_host_byte(scmnd, DID_ERROR);
 		}
 		break;
 	case SRB_STATUS_INVALID_LUN:
+		set_host_byte(scmnd, DID_NO_CONNECT);
 		do_work = true;
 		process_err_fn = storvsc_remove_lun;
 		break;

From 4c82fd0abb87e20d0d68ef5237e74732352806c8 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 20 Dec 2017 12:08:33 +0100
Subject: [PATCH 088/305] netfilter: uapi: correct UNTRACKED conntrack state
 bit number

nft_ct exposes this bit to userspace.  This used to be

  #define NF_CT_STATE_UNTRACKED_BIT              (1 << (IP_CT_NUMBER + 1))
  (IP_CT_NUMBER is 5, so this was 0x40)

.. but this got changed to 8 (0x100) when the untracked object got removed.
Replace this with a literal 6 to prevent further incompatible changes
in case IP_CT_NUMBER ever increases.

Fixes: cc41c84b7e7f2 ("netfilter: kill the fake untracked conntrack objects")
Reported-by: Li Shuang <shuali@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 3fea7709a441..57ccfb32e87f 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -36,7 +36,7 @@ enum ip_conntrack_info {
 
 #define NF_CT_STATE_INVALID_BIT			(1 << 0)
 #define NF_CT_STATE_BIT(ctinfo)			(1 << ((ctinfo) % IP_CT_IS_REPLY + 1))
-#define NF_CT_STATE_UNTRACKED_BIT		(1 << (IP_CT_UNTRACKED + 1))
+#define NF_CT_STATE_UNTRACKED_BIT		(1 << 6)
 
 /* Bitset representing status of connection. */
 enum ip_conntrack_status {

From 9b3fa47d4a76b1d606a396455f9bbeee083ef008 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Wed, 13 Dec 2017 15:21:22 -0800
Subject: [PATCH 089/305] kobject: fix suppressing modalias in uevents
 delivered over netlink

The commit 4a336a23d619 ("kobject: copy env blob in one go") optimized
constructing uevent data for delivery over netlink by using the raw
environment buffer, instead of reconstructing it from individual
environment pointers. Unfortunately in doing so it broke suppressing
MODALIAS attribute for KOBJ_UNBIND events, as the code that suppressed this
attribute only adjusted the environment pointers, but left the buffer
itself alone. Let's fix it by making sure the offending attribute is
obliterated from the buffer as well.

Reported-by: Tariq Toukan <tariqt@mellanox.com>
Reported-by: Casey Leedom <leedom@chelsio.com>
Fixes: 4a336a23d619 ("kobject: copy env blob in one go")
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 lib/kobject_uevent.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index c3e84edc47c9..2615074d3de5 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -346,7 +346,8 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
 static void zap_modalias_env(struct kobj_uevent_env *env)
 {
 	static const char modalias_prefix[] = "MODALIAS=";
-	int i;
+	size_t len;
+	int i, j;
 
 	for (i = 0; i < env->envp_idx;) {
 		if (strncmp(env->envp[i], modalias_prefix,
@@ -355,11 +356,18 @@ static void zap_modalias_env(struct kobj_uevent_env *env)
 			continue;
 		}
 
-		if (i != env->envp_idx - 1)
-			memmove(&env->envp[i], &env->envp[i + 1],
-				sizeof(env->envp[i]) * env->envp_idx - 1);
+		len = strlen(env->envp[i]) + 1;
+
+		if (i != env->envp_idx - 1) {
+			memmove(env->envp[i], env->envp[i + 1],
+				env->buflen - len);
+
+			for (j = i; j < env->envp_idx - 1; j++)
+				env->envp[j] = env->envp[j + 1] - len;
+		}
 
 		env->envp_idx--;
+		env->buflen -= len;
 	}
 }
 

From 966031f340185eddd05affcf72b740549f056348 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 20 Dec 2017 17:57:06 -0800
Subject: [PATCH 090/305] n_tty: fix EXTPROC vs ICANON interaction with TIOCINQ
 (aka FIONREAD)

We added support for EXTPROC back in 2010 in commit 26df6d13406d ("tty:
Add EXTPROC support for LINEMODE") and the intent was to allow it to
override some (all?) ICANON behavior.  Quoting from that original commit
message:

         There is a new bit in the termios local flag word, EXTPROC.
         When this bit is set, several aspects of the terminal driver
         are disabled.  Input line editing, character echo, and mapping
         of signals are all disabled.  This allows the telnetd to turn
         off these functions when in linemode, but still keep track of
         what state the user wants the terminal to be in.

but the problem turns out that "several aspects of the terminal driver
are disabled" is a bit ambiguous, and you can really confuse the n_tty
layer by setting EXTPROC and then causing some of the ICANON invariants
to no longer be maintained.

This fixes at least one such case (TIOCINQ) becoming unhappy because of
the confusion over whether ICANON really means ICANON when EXTPROC is set.

This basically makes TIOCINQ match the case of read: if EXTPROC is set,
we ignore ICANON.  Also, make sure to reset the ICANON state if EXTPROC
changes, not just if ICANON changes.

Fixes: 26df6d13406d ("tty: Add EXTPROC support for LINEMODE")
Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Reported-by: syzkaller <syzkaller@googlegroups.com>
Cc: Jiri Slaby <jslaby@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/n_tty.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 427e0d5d8f13..539b49adb6af 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1762,7 +1762,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 
-	if (!old || (old->c_lflag ^ tty->termios.c_lflag) & ICANON) {
+	if (!old || (old->c_lflag ^ tty->termios.c_lflag) & (ICANON | EXTPROC)) {
 		bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE);
 		ldata->line_start = ldata->read_tail;
 		if (!L_ICANON(tty) || !read_cnt(ldata)) {
@@ -2425,7 +2425,7 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file,
 		return put_user(tty_chars_in_buffer(tty), (int __user *) arg);
 	case TIOCINQ:
 		down_write(&tty->termios_rwsem);
-		if (L_ICANON(tty))
+		if (L_ICANON(tty) && !L_EXTPROC(tty))
 			retval = inq_canon(ldata);
 		else
 			retval = read_cnt(ldata);

From 976a9b35d77a9d297cb03154aa61a6214a213b5e Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Wed, 20 Dec 2017 18:17:29 +0100
Subject: [PATCH 091/305] ARM: dts: exynos: Enable Mixer node for Exynos5800
 Peach Pi machine

Commit 1cb686c08d12 ("ARM: dts: exynos: Add status property to Exynos 542x
Mixer nodes") disabled the Mixer node by default in the DTSI and enabled it
for each Exynos 542x DTS. Unfortunately, it failed to enable it for the
Exynos5800 Peach Pi machine, even though the 5800 is also a 542x SoC variant.

Fixes: 1cb686c08d12 ("ARM: dts: exynos: Add status property to Exynos 542x Mixer nodes")
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Guillaume Tucker <guillaume.tucker@collabora.com>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/exynos5800-peach-pi.dts | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm/boot/dts/exynos5800-peach-pi.dts b/arch/arm/boot/dts/exynos5800-peach-pi.dts
index b2b95ff205e8..0029ec27819c 100644
--- a/arch/arm/boot/dts/exynos5800-peach-pi.dts
+++ b/arch/arm/boot/dts/exynos5800-peach-pi.dts
@@ -664,6 +664,10 @@
 	status = "okay";
 };
 
+&mixer {
+	status = "okay";
+};
+
 /* eMMC flash */
 &mmc_0 {
 	status = "okay";

From d2271826e58b83f9a75634a3f4334082ecf0a02e Mon Sep 17 00:00:00 2001
From: Joel Stanley <joel@jms.id.au>
Date: Fri, 15 Dec 2017 16:03:32 +1030
Subject: [PATCH 092/305] ARM: dts: aspeed-g4: Correct VUART IRQ number
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This should have always been 8.

Fixes: db4d6d9d80fa ("ARM: dts: aspeed: Correctly order UART nodes")
Cc: stable@vger.kernel.org
Signed-off-by: Joel Stanley <joel@jms.id.au>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/aspeed-g4.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/aspeed-g4.dtsi b/arch/arm/boot/dts/aspeed-g4.dtsi
index 45d815a86d42..de08d9045cb8 100644
--- a/arch/arm/boot/dts/aspeed-g4.dtsi
+++ b/arch/arm/boot/dts/aspeed-g4.dtsi
@@ -219,7 +219,7 @@
 				compatible = "aspeed,ast2400-vuart";
 				reg = <0x1e787000 0x40>;
 				reg-shift = <2>;
-				interrupts = <10>;
+				interrupts = <8>;
 				clocks = <&clk_uart>;
 				no-loopback-test;
 				status = "disabled";

From 506e8a912661c97b41adc8a286b875d01323ec45 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 21 Dec 2017 22:35:19 +0100
Subject: [PATCH 093/305] ARM: dts: ls1021a: fix incorrect clock references

dtc warns about two 'clocks' properties that have an extraneous '1'
at the end:

arch/arm/boot/dts/ls1021a-qds.dtb: Warning (clocks_property): arch/arm/boot/dts/ls1021a-twr.dtb: Warning (clocks_property): Property 'clocks', cell 1 is not a phandle reference in /soc/i2c@2180000/mux@77/i2c@4/sgtl5000@2a
arch/arm/boot/dts/ls1021a-qds.dtb: Warning (clocks_property): Missing property '#clock-cells' in node /soc/interrupt-controller@1400000 or bad phandle (referred from /soc/i2c@2180000/mux@77/i2c@4/sgtl5000@2a:clocks[1])
Property 'clocks', cell 1 is not a phandle reference in /soc/i2c@2190000/sgtl5000@a
arch/arm/boot/dts/ls1021a-twr.dtb: Warning (clocks_property): Missing property '#clock-cells' in node /soc/interrupt-controller@1400000 or bad phandle (referred from /soc/i2c@2190000/sgtl5000@a:clocks[1])

The clocks that get referenced here are fixed-rate, so they do not
take any argument, and dtc interprets the next cell as a phandle, which
is invalid.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/ls1021a-qds.dts | 2 +-
 arch/arm/boot/dts/ls1021a-twr.dts | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/ls1021a-qds.dts b/arch/arm/boot/dts/ls1021a-qds.dts
index 940875316d0f..67b4de0e3439 100644
--- a/arch/arm/boot/dts/ls1021a-qds.dts
+++ b/arch/arm/boot/dts/ls1021a-qds.dts
@@ -215,7 +215,7 @@
 				reg = <0x2a>;
 				VDDA-supply = <&reg_3p3v>;
 				VDDIO-supply = <&reg_3p3v>;
-				clocks = <&sys_mclk 1>;
+				clocks = <&sys_mclk>;
 			};
 		};
 	};
diff --git a/arch/arm/boot/dts/ls1021a-twr.dts b/arch/arm/boot/dts/ls1021a-twr.dts
index a8b148ad1dd2..44715c8ef756 100644
--- a/arch/arm/boot/dts/ls1021a-twr.dts
+++ b/arch/arm/boot/dts/ls1021a-twr.dts
@@ -187,7 +187,7 @@
 		reg = <0x0a>;
 		VDDA-supply = <&reg_3p3v>;
 		VDDIO-supply = <&reg_3p3v>;
-		clocks = <&sys_mclk 1>;
+		clocks = <&sys_mclk>;
 	};
 };
 

From fbd90b4cae105fbd8364fa1ce3f41d0c06296f58 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 21 Dec 2017 22:45:24 +0100
Subject: [PATCH 094/305] ARM: dts: tango4: remove bogus interrupt-controller
 property

dtc points out that the parent node of the interrupt controllers is not
actually an interrupt controller itself, and lacks an #interrupt-cells
property:

arch/arm/boot/dts/tango4-vantage-1172.dtb: Warning (interrupts_property): Missing #interrupt-cells in interrupt-parent /soc/interrupt-controller@6e000

This removes the annotation.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/tango4-common.dtsi | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm/boot/dts/tango4-common.dtsi b/arch/arm/boot/dts/tango4-common.dtsi
index 0ec1b0a317b4..ff72a8efb73d 100644
--- a/arch/arm/boot/dts/tango4-common.dtsi
+++ b/arch/arm/boot/dts/tango4-common.dtsi
@@ -156,7 +156,6 @@
 			reg = <0x6e000 0x400>;
 			ranges = <0 0x6e000 0x400>;
 			interrupt-parent = <&gic>;
-			interrupt-controller;
 			#address-cells = <1>;
 			#size-cells = <1>;
 

From d042566d8c704e1ecec370300545d4a409222e39 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 5 Dec 2017 11:10:26 +0100
Subject: [PATCH 095/305] crypto: chelsio - select CRYPTO_GF128MUL

Without the gf128mul library support, we can run into a link
error:

drivers/crypto/chelsio/chcr_algo.o: In function `chcr_update_tweak':
chcr_algo.c:(.text+0x7e0): undefined reference to `gf128mul_x8_ble'

This adds a Kconfig select statement for it, next to the ones we
already have.

Cc: <stable@vger.kernel.org>
Fixes: b8fd1f4170e7 ("crypto: chcr - Add ctr mode and process large sg entries for cipher")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/chelsio/Kconfig b/drivers/crypto/chelsio/Kconfig
index 3e104f5aa0c2..b56b3f711d94 100644
--- a/drivers/crypto/chelsio/Kconfig
+++ b/drivers/crypto/chelsio/Kconfig
@@ -5,6 +5,7 @@ config CRYPTO_DEV_CHELSIO
 	select CRYPTO_SHA256
 	select CRYPTO_SHA512
 	select CRYPTO_AUTHENC
+	select CRYPTO_GF128MUL
 	---help---
 	  The Chelsio Crypto Co-processor driver for T6 adapters.
 

From e57121d08c38dabec15cf3e1e2ad46721af30cae Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 11 Dec 2017 12:15:17 -0800
Subject: [PATCH 096/305] crypto: chacha20poly1305 - validate the digest size

If the rfc7539 template was instantiated with a hash algorithm with
digest size larger than 16 bytes (POLY1305_DIGEST_SIZE), then the digest
overran the 'tag' buffer in 'struct chachapoly_req_ctx', corrupting the
subsequent memory, including 'cryptlen'.  This caused a crash during
crypto_skcipher_decrypt().

Fix it by, when instantiating the template, requiring that the
underlying hash algorithm has the digest size expected for Poly1305.

Reproducer:

    #include <linux/if_alg.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main()
    {
            int algfd, reqfd;
            struct sockaddr_alg addr = {
                    .salg_type = "aead",
                    .salg_name = "rfc7539(chacha20,sha256)",
            };
            unsigned char buf[32] = { 0 };

            algfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
            bind(algfd, (void *)&addr, sizeof(addr));
            setsockopt(algfd, SOL_ALG, ALG_SET_KEY, buf, sizeof(buf));
            reqfd = accept(algfd, 0, 0);
            write(reqfd, buf, 16);
            read(reqfd, buf, 16);
    }

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 71ebc4d1b27d ("crypto: chacha20poly1305 - Add a ChaCha20-Poly1305 AEAD construction, RFC7539")
Cc: <stable@vger.kernel.org> # v4.2+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/chacha20poly1305.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index db1bc3147bc4..600afa99941f 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -610,6 +610,11 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
 						    algt->mask));
 	if (IS_ERR(poly))
 		return PTR_ERR(poly);
+	poly_hash = __crypto_hash_alg_common(poly);
+
+	err = -EINVAL;
+	if (poly_hash->digestsize != POLY1305_DIGEST_SIZE)
+		goto out_put_poly;
 
 	err = -ENOMEM;
 	inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
@@ -618,7 +623,6 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
 
 	ctx = aead_instance_ctx(inst);
 	ctx->saltlen = CHACHAPOLY_IV_SIZE - ivsize;
-	poly_hash = __crypto_hash_alg_common(poly);
 	err = crypto_init_ahash_spawn(&ctx->poly, poly_hash,
 				      aead_crypto_instance(inst));
 	if (err)

From af955bf15d2c27496b0269b1f05c26f758c68314 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Tue, 19 Dec 2017 10:27:24 +0000
Subject: [PATCH 097/305] crypto: af_alg - Fix race around ctx->rcvused by
 making it atomic_t

This variable was increased and decreased without any protection.
The result was an occasional miscount and negative wrap around resulting
in false resource allocation failures.

Fixes: 7d2c3f54e6f6 ("crypto: af_alg - remove locking in async callback")
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/af_alg.c         | 4 ++--
 crypto/algif_aead.c     | 2 +-
 crypto/algif_skcipher.c | 2 +-
 include/crypto/if_alg.h | 5 +++--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index f1a2caf1b59b..d3f1c431724b 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -664,7 +664,7 @@ void af_alg_free_areq_sgls(struct af_alg_async_req *areq)
 	unsigned int i;
 
 	list_for_each_entry_safe(rsgl, tmp, &areq->rsgl_list, list) {
-		ctx->rcvused -= rsgl->sg_num_bytes;
+		atomic_sub(rsgl->sg_num_bytes, &ctx->rcvused);
 		af_alg_free_sg(&rsgl->sgl);
 		list_del(&rsgl->list);
 		if (rsgl != &areq->first_rsgl)
@@ -1162,7 +1162,7 @@ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
 
 		areq->last_rsgl = rsgl;
 		len += err;
-		ctx->rcvused += err;
+		atomic_add(err, &ctx->rcvused);
 		rsgl->sg_num_bytes = err;
 		iov_iter_advance(&msg->msg_iter, err);
 	}
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index b73db2b27656..20df8c1b6851 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -571,7 +571,7 @@ static int aead_accept_parent_nokey(void *private, struct sock *sk)
 	INIT_LIST_HEAD(&ctx->tsgl_list);
 	ctx->len = len;
 	ctx->used = 0;
-	ctx->rcvused = 0;
+	atomic_set(&ctx->rcvused, 0);
 	ctx->more = 0;
 	ctx->merge = 0;
 	ctx->enc = 0;
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index baef9bfccdda..c5c47b680152 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -390,7 +390,7 @@ static int skcipher_accept_parent_nokey(void *private, struct sock *sk)
 	INIT_LIST_HEAD(&ctx->tsgl_list);
 	ctx->len = len;
 	ctx->used = 0;
-	ctx->rcvused = 0;
+	atomic_set(&ctx->rcvused, 0);
 	ctx->more = 0;
 	ctx->merge = 0;
 	ctx->enc = 0;
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index 38d9c5861ed8..f38227a78eae 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -18,6 +18,7 @@
 #include <linux/if_alg.h>
 #include <linux/scatterlist.h>
 #include <linux/types.h>
+#include <linux/atomic.h>
 #include <net/sock.h>
 
 #include <crypto/aead.h>
@@ -150,7 +151,7 @@ struct af_alg_ctx {
 	struct crypto_wait wait;
 
 	size_t used;
-	size_t rcvused;
+	atomic_t rcvused;
 
 	bool more;
 	bool merge;
@@ -215,7 +216,7 @@ static inline int af_alg_rcvbuf(struct sock *sk)
 	struct af_alg_ctx *ctx = ask->private;
 
 	return max_t(int, max_t(int, sk->sk_rcvbuf & PAGE_MASK, PAGE_SIZE) -
-			  ctx->rcvused, 0);
+		     atomic_read(&ctx->rcvused), 0);
 }
 
 /**

From 203f45003a3d03eea8fa28d74cfc74c354416fdb Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@inai.de>
Date: Tue, 19 Dec 2017 19:09:07 +0100
Subject: [PATCH 098/305] crypto: n2 - cure use after free

queue_cache_init is first called for the Control Word Queue
(n2_crypto_probe). At that time, queue_cache[0] is NULL and a new
kmem_cache will be allocated. If the subsequent n2_register_algs call
fails, the kmem_cache will be released in queue_cache_destroy, but
queue_cache[0] is not set back to NULL.

So when the Module Arithmetic Unit gets probed next (n2_mau_probe),
queue_cache_init will not allocate a kmem_cache again, but leave it
as its bogus value, causing a BUG() to trigger when queue_cache[0] is
eventually passed to kmem_cache_zalloc:

	n2_crypto: Found N2CP at /virtual-devices@100/n2cp@7
	n2_crypto: Registered NCS HVAPI version 2.0
	called queue_cache_init
	n2_crypto: md5 alg registration failed
	n2cp f028687c: /virtual-devices@100/n2cp@7: Unable to register algorithms.
	called queue_cache_destroy
	n2cp: probe of f028687c failed with error -22
	n2_crypto: Found NCP at /virtual-devices@100/ncp@6
	n2_crypto: Registered NCS HVAPI version 2.0
	called queue_cache_init
	kernel BUG at mm/slab.c:2993!
	Call Trace:
	 [0000000000604488] kmem_cache_alloc+0x1a8/0x1e0
                  (inlined) kmem_cache_zalloc
                  (inlined) new_queue
                  (inlined) spu_queue_setup
                  (inlined) handle_exec_unit
	 [0000000010c61eb4] spu_mdesc_scan+0x1f4/0x460 [n2_crypto]
	 [0000000010c62b80] n2_mau_probe+0x100/0x220 [n2_crypto]
	 [000000000084b174] platform_drv_probe+0x34/0xc0

Cc: <stable@vger.kernel.org>
Signed-off-by: Jan Engelhardt <jengelh@inai.de>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/n2_core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c
index 48de52cf2ecc..662e709812cc 100644
--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
@@ -1625,6 +1625,7 @@ static int queue_cache_init(void)
 					  CWQ_ENTRY_SIZE, 0, NULL);
 	if (!queue_cache[HV_NCS_QTYPE_CWQ - 1]) {
 		kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_MAU - 1]);
+		queue_cache[HV_NCS_QTYPE_MAU - 1] = NULL;
 		return -ENOMEM;
 	}
 	return 0;
@@ -1634,6 +1635,8 @@ static void queue_cache_destroy(void)
 {
 	kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_MAU - 1]);
 	kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]);
+	queue_cache[HV_NCS_QTYPE_MAU - 1] = NULL;
+	queue_cache[HV_NCS_QTYPE_CWQ - 1] = NULL;
 }
 
 static long spu_queue_register_workfn(void *arg)

From d76c68109f37cb85b243a1cf0f40313afd2bae68 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 20 Dec 2017 14:28:25 -0800
Subject: [PATCH 099/305] crypto: pcrypt - fix freeing pcrypt instances

pcrypt is using the old way of freeing instances, where the ->free()
method specified in the 'struct crypto_template' is passed a pointer to
the 'struct crypto_instance'.  But the crypto_instance is being
kfree()'d directly, which is incorrect because the memory was actually
allocated as an aead_instance, which contains the crypto_instance at a
nonzero offset.  Thus, the wrong pointer was being kfree()'d.

Fix it by switching to the new way to free aead_instance's where the
->free() method is specified in the aead_instance itself.

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 0496f56065e0 ("crypto: pcrypt - Add support for new AEAD interface")
Cc: <stable@vger.kernel.org> # v4.2+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/pcrypt.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index ee9cfb99fe25..f8ec3d4ba4a8 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -254,6 +254,14 @@ static void pcrypt_aead_exit_tfm(struct crypto_aead *tfm)
 	crypto_free_aead(ctx->child);
 }
 
+static void pcrypt_free(struct aead_instance *inst)
+{
+	struct pcrypt_instance_ctx *ctx = aead_instance_ctx(inst);
+
+	crypto_drop_aead(&ctx->spawn);
+	kfree(inst);
+}
+
 static int pcrypt_init_instance(struct crypto_instance *inst,
 				struct crypto_alg *alg)
 {
@@ -319,6 +327,8 @@ static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb,
 	inst->alg.encrypt = pcrypt_aead_encrypt;
 	inst->alg.decrypt = pcrypt_aead_decrypt;
 
+	inst->free = pcrypt_free;
+
 	err = aead_register_instance(tmpl, inst);
 	if (err)
 		goto out_drop_aead;
@@ -349,14 +359,6 @@ static int pcrypt_create(struct crypto_template *tmpl, struct rtattr **tb)
 	return -EINVAL;
 }
 
-static void pcrypt_free(struct crypto_instance *inst)
-{
-	struct pcrypt_instance_ctx *ctx = crypto_instance_ctx(inst);
-
-	crypto_drop_aead(&ctx->spawn);
-	kfree(inst);
-}
-
 static int pcrypt_cpumask_change_notify(struct notifier_block *self,
 					unsigned long val, void *data)
 {
@@ -469,7 +471,6 @@ static void pcrypt_fini_padata(struct padata_pcrypt *pcrypt)
 static struct crypto_template pcrypt_tmpl = {
 	.name = "pcrypt",
 	.create = pcrypt_create,
-	.free = pcrypt_free,
 	.module = THIS_MODULE,
 };
 

From 87c059e9c39dae20b8b9bd19d9ec55a6d6c10468 Mon Sep 17 00:00:00 2001
From: Bogdan Mirea <Bogdan-Stefan_Mirea@mentor.com>
Date: Thu, 21 Dec 2017 17:18:58 +0200
Subject: [PATCH 100/305] arm64: dts: renesas: salvator-x: Remove renesas,
 no-ether-link property

The present change is a bug fix for AVB link iteratively up/down.

Steps to reproduce:
- start AVB TX stream (Using aplay via MSE),
- disconnect+reconnect the eth cable,
- after a reconnection the eth connection goes iteratively up/down
  without user interaction,
- this may heal after some seconds or even stay for minutes.

As the documentation specifies, the "renesas,no-ether-link" option
should be used when a board does not provide a proper AVB_LINK signal.
There is no need for this option enabled on RCAR H3/M3 Salvator-X/XS
and ULCB starter kits since the AVB_LINK is correctly handled by HW.

Choosing to keep or remove the "renesas,no-ether-link" option will
have impact on the code flow in the following ways:
- keeping this option enabled may lead to unexpected behavior since
  the RX & TX are enabled/disabled directly from adjust_link function
  without any HW interrogation,
- removing this option, the RX & TX will only be enabled/disabled after
  HW interrogation. The HW check is made through the LMON pin in PSR
  register which specifies AVB_LINK signal value (0 - at low level;
  1 - at high level).

In conclusion, the present change is also a safety improvement because
it removes the "renesas,no-ether-link" option leading to a proper way
of detecting the link state based on HW interrogation and not on
software heuristic.

Fixes: dc36965a8905 ("arm64: dts: r8a7796: salvator-x: Enable EthernetAVB")
Fixes: 6fa501c549aa ("arm64: dts: r8a7795: enable EthernetAVB on Salvator-X")
Signed-off-by: Bogdan Mirea <Bogdan-Stefan_Mirea@mentor.com>
Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 arch/arm64/boot/dts/renesas/salvator-common.dtsi | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm64/boot/dts/renesas/salvator-common.dtsi b/arch/arm64/boot/dts/renesas/salvator-common.dtsi
index a298df74ca6c..dbe2648649db 100644
--- a/arch/arm64/boot/dts/renesas/salvator-common.dtsi
+++ b/arch/arm64/boot/dts/renesas/salvator-common.dtsi
@@ -255,7 +255,6 @@
 &avb {
 	pinctrl-0 = <&avb_pins>;
 	pinctrl-names = "default";
-	renesas,no-ether-link;
 	phy-handle = <&phy0>;
 	status = "okay";
 

From bbc25bee37d2b32cf3a1fab9195b6da3a185614a Mon Sep 17 00:00:00 2001
From: James Hogan <jhogan@kernel.org>
Date: Tue, 5 Dec 2017 23:31:35 +0000
Subject: [PATCH 101/305] lib/mpi: Fix umul_ppmm() for MIPS64r6

Current MIPS64r6 toolchains aren't able to generate efficient
DMULU/DMUHU based code for the C implementation of umul_ppmm(), which
performs an unsigned 64 x 64 bit multiply and returns the upper and
lower 64-bit halves of the 128-bit result. Instead it widens the 64-bit
inputs to 128-bits and emits a __multi3 intrinsic call to perform a 128
x 128 multiply. This is both inefficient, and it results in a link error
since we don't include __multi3 in MIPS linux.

For example commit 90a53e4432b1 ("cfg80211: implement regdb signature
checking") merged in v4.15-rc1 recently broke the 64r6_defconfig and
64r6el_defconfig builds by indirectly selecting MPILIB. The same build
errors can be reproduced on older kernels by enabling e.g. CRYPTO_RSA:

lib/mpi/generic_mpih-mul1.o: In function `mpihelp_mul_1':
lib/mpi/generic_mpih-mul1.c:50: undefined reference to `__multi3'
lib/mpi/generic_mpih-mul2.o: In function `mpihelp_addmul_1':
lib/mpi/generic_mpih-mul2.c:49: undefined reference to `__multi3'
lib/mpi/generic_mpih-mul3.o: In function `mpihelp_submul_1':
lib/mpi/generic_mpih-mul3.c:49: undefined reference to `__multi3'
lib/mpi/mpih-div.o: In function `mpihelp_divrem':
lib/mpi/mpih-div.c:205: undefined reference to `__multi3'
lib/mpi/mpih-div.c:142: undefined reference to `__multi3'

Therefore add an efficient MIPS64r6 implementation of umul_ppmm() using
inline assembly and the DMULU/DMUHU instructions, to prevent __multi3
calls being emitted.

Fixes: 7fd08ca58ae6 ("MIPS: Add build support for the MIPS R6 ISA")
Signed-off-by: James Hogan <jhogan@kernel.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: linux-mips@linux-mips.org
Cc: linux-crypto@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 lib/mpi/longlong.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/lib/mpi/longlong.h b/lib/mpi/longlong.h
index 57fd45ab7af1..08c60d10747f 100644
--- a/lib/mpi/longlong.h
+++ b/lib/mpi/longlong.h
@@ -671,7 +671,23 @@ do {						\
 	**************  MIPS/64  **************
 	***************************************/
 #if (defined(__mips) && __mips >= 3) && W_TYPE_SIZE == 64
-#if (__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 4)
+#if defined(__mips_isa_rev) && __mips_isa_rev >= 6
+/*
+ * GCC ends up emitting a __multi3 intrinsic call for MIPS64r6 with the plain C
+ * code below, so we special case MIPS64r6 until the compiler can do better.
+ */
+#define umul_ppmm(w1, w0, u, v)						\
+do {									\
+	__asm__ ("dmulu %0,%1,%2"					\
+		 : "=d" ((UDItype)(w0))					\
+		 : "d" ((UDItype)(u)),					\
+		   "d" ((UDItype)(v)));					\
+	__asm__ ("dmuhu %0,%1,%2"					\
+		 : "=d" ((UDItype)(w1))					\
+		 : "d" ((UDItype)(u)),					\
+		   "d" ((UDItype)(v)));					\
+} while (0)
+#elif (__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 4)
 #define umul_ppmm(w1, w0, u, v) \
 do {									\
 	typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\

From 7d2901f809c110bd9a261e879d59efe62e3bc758 Mon Sep 17 00:00:00 2001
From: Bogdan Mirea <Bogdan-Stefan_Mirea@mentor.com>
Date: Thu, 21 Dec 2017 17:18:59 +0200
Subject: [PATCH 102/305] arm64: dts: renesas: ulcb: Remove renesas,
 no-ether-link property

The present change is a bug fix for the AVB link repeatedly going up/down.

Steps to reproduce:
- start AVB TX stream (Using aplay via MSE),
- disconnect+reconnect the eth cable,
- after a reconnection the eth connection repeatedly goes up/down
  without user interaction,
- this may heal after some seconds or even stay for minutes.

As the documentation specifies, the "renesas,no-ether-link" option
should be used when a board does not provide a proper AVB_LINK signal.
There is no need to enable this option on RCAR H3/M3 Salvator-X/XS
and ULCB starter kits since the AVB_LINK is correctly handled by HW.

Choosing to keep or remove the "renesas,no-ether-link" option will
have impact on the code flow in the following ways:
- keeping this option enabled may lead to unexpected behavior since
  the RX & TX are enabled/disabled directly from adjust_link function
  without any HW interrogation,
- removing this option, the RX & TX will only be enabled/disabled after
  HW interrogation. The HW check is made through the LMON pin in PSR
  register which specifies AVB_LINK signal value (0 - at low level;
  1 - at high level).

In conclusion, the present change is also a safety improvement because
it removes the "renesas,no-ether-link" option, leading to a proper way
of detecting the link state based on HW interrogation and not on a
software heuristic.

Fixes: dc36965a8905 ("arm64: dts: r8a7796: salvator-x: Enable EthernetAVB")
Fixes: 6fa501c549aa ("arm64: dts: r8a7795: enable EthernetAVB on Salvator-X")
Signed-off-by: Bogdan Mirea <Bogdan-Stefan_Mirea@mentor.com>
Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 arch/arm64/boot/dts/renesas/ulcb.dtsi | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm64/boot/dts/renesas/ulcb.dtsi b/arch/arm64/boot/dts/renesas/ulcb.dtsi
index 0d85b315ce71..73439cf48659 100644
--- a/arch/arm64/boot/dts/renesas/ulcb.dtsi
+++ b/arch/arm64/boot/dts/renesas/ulcb.dtsi
@@ -145,7 +145,6 @@
 &avb {
 	pinctrl-0 = <&avb_pins>;
 	pinctrl-names = "default";
-	renesas,no-ether-link;
 	phy-handle = <&phy0>;
 	status = "okay";
 

From 1eb7b40386c97f6c4d1c62931bf306f4535a4bd6 Mon Sep 17 00:00:00 2001
From: Ofer Heifetz <oferh@marvell.com>
Date: Mon, 11 Dec 2017 12:10:55 +0100
Subject: [PATCH 103/305] crypto: inside-secure - per request invalidation

When an invalidation request is needed we currently override the context
.send and .handle_result helpers. This is wrong as under high load other
requests can already be queued and overriding the context helpers will
make them execute the wrong .send and .handle_result functions.

This commit fixes this by adding a needs_inv flag in the request to
choose the action to perform when sending requests or handling their
results. This flag is set when needed (i.e. when the context flag
is set).

Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver")
Signed-off-by: Ofer Heifetz <oferh@marvell.com>
[Antoine: commit message, and removed non related changes from the
original commit]
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 .../crypto/inside-secure/safexcel_cipher.c    | 71 +++++++++++++++----
 drivers/crypto/inside-secure/safexcel_hash.c  | 67 +++++++++++++----
 2 files changed, 111 insertions(+), 27 deletions(-)

diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c
index 5438552bc6d7..9ea24868d860 100644
--- a/drivers/crypto/inside-secure/safexcel_cipher.c
+++ b/drivers/crypto/inside-secure/safexcel_cipher.c
@@ -14,6 +14,7 @@
 
 #include <crypto/aes.h>
 #include <crypto/skcipher.h>
+#include <crypto/internal/skcipher.h>
 
 #include "safexcel.h"
 
@@ -33,6 +34,10 @@ struct safexcel_cipher_ctx {
 	unsigned int key_len;
 };
 
+struct safexcel_cipher_req {
+	bool needs_inv;
+};
+
 static void safexcel_cipher_token(struct safexcel_cipher_ctx *ctx,
 				  struct crypto_async_request *async,
 				  struct safexcel_command_desc *cdesc,
@@ -126,9 +131,9 @@ static int safexcel_context_control(struct safexcel_cipher_ctx *ctx,
 	return 0;
 }
 
-static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring,
-				  struct crypto_async_request *async,
-				  bool *should_complete, int *ret)
+static int safexcel_handle_req_result(struct safexcel_crypto_priv *priv, int ring,
+				      struct crypto_async_request *async,
+				      bool *should_complete, int *ret)
 {
 	struct skcipher_request *req = skcipher_request_cast(async);
 	struct safexcel_result_desc *rdesc;
@@ -265,7 +270,6 @@ static int safexcel_aes_send(struct crypto_async_request *async,
 	spin_unlock_bh(&priv->ring[ring].egress_lock);
 
 	request->req = &req->base;
-	ctx->base.handle_result = safexcel_handle_result;
 
 	*commands = n_cdesc;
 	*results = n_rdesc;
@@ -341,8 +345,6 @@ static int safexcel_handle_inv_result(struct safexcel_crypto_priv *priv,
 
 	ring = safexcel_select_ring(priv);
 	ctx->base.ring = ring;
-	ctx->base.needs_inv = false;
-	ctx->base.send = safexcel_aes_send;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
 	enq_ret = crypto_enqueue_request(&priv->ring[ring].queue, async);
@@ -359,6 +361,26 @@ static int safexcel_handle_inv_result(struct safexcel_crypto_priv *priv,
 	return ndesc;
 }
 
+static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring,
+				  struct crypto_async_request *async,
+				  bool *should_complete, int *ret)
+{
+	struct skcipher_request *req = skcipher_request_cast(async);
+	struct safexcel_cipher_req *sreq = skcipher_request_ctx(req);
+	int err;
+
+	if (sreq->needs_inv) {
+		sreq->needs_inv = false;
+		err = safexcel_handle_inv_result(priv, ring, async,
+						 should_complete, ret);
+	} else {
+		err = safexcel_handle_req_result(priv, ring, async,
+						 should_complete, ret);
+	}
+
+	return err;
+}
+
 static int safexcel_cipher_send_inv(struct crypto_async_request *async,
 				    int ring, struct safexcel_request *request,
 				    int *commands, int *results)
@@ -368,8 +390,6 @@ static int safexcel_cipher_send_inv(struct crypto_async_request *async,
 	struct safexcel_crypto_priv *priv = ctx->priv;
 	int ret;
 
-	ctx->base.handle_result = safexcel_handle_inv_result;
-
 	ret = safexcel_invalidate_cache(async, &ctx->base, priv,
 					ctx->base.ctxr_dma, ring, request);
 	if (unlikely(ret))
@@ -381,11 +401,29 @@ static int safexcel_cipher_send_inv(struct crypto_async_request *async,
 	return 0;
 }
 
+static int safexcel_send(struct crypto_async_request *async,
+			 int ring, struct safexcel_request *request,
+			 int *commands, int *results)
+{
+	struct skcipher_request *req = skcipher_request_cast(async);
+	struct safexcel_cipher_req *sreq = skcipher_request_ctx(req);
+	int ret;
+
+	if (sreq->needs_inv)
+		ret = safexcel_cipher_send_inv(async, ring, request,
+					       commands, results);
+	else
+		ret = safexcel_aes_send(async, ring, request,
+					commands, results);
+	return ret;
+}
+
 static int safexcel_cipher_exit_inv(struct crypto_tfm *tfm)
 {
 	struct safexcel_cipher_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct safexcel_crypto_priv *priv = ctx->priv;
 	struct skcipher_request req;
+	struct safexcel_cipher_req *sreq = skcipher_request_ctx(&req);
 	struct safexcel_inv_result result = {};
 	int ring = ctx->base.ring;
 
@@ -399,7 +437,7 @@ static int safexcel_cipher_exit_inv(struct crypto_tfm *tfm)
 	skcipher_request_set_tfm(&req, __crypto_skcipher_cast(tfm));
 	ctx = crypto_tfm_ctx(req.base.tfm);
 	ctx->base.exit_inv = true;
-	ctx->base.send = safexcel_cipher_send_inv;
+	sreq->needs_inv = true;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
 	crypto_enqueue_request(&priv->ring[ring].queue, &req.base);
@@ -424,19 +462,21 @@ static int safexcel_aes(struct skcipher_request *req,
 			enum safexcel_cipher_direction dir, u32 mode)
 {
 	struct safexcel_cipher_ctx *ctx = crypto_tfm_ctx(req->base.tfm);
+	struct safexcel_cipher_req *sreq = skcipher_request_ctx(req);
 	struct safexcel_crypto_priv *priv = ctx->priv;
 	int ret, ring;
 
+	sreq->needs_inv = false;
 	ctx->direction = dir;
 	ctx->mode = mode;
 
 	if (ctx->base.ctxr) {
-		if (ctx->base.needs_inv)
-			ctx->base.send = safexcel_cipher_send_inv;
+		if (ctx->base.needs_inv) {
+			sreq->needs_inv = true;
+			ctx->base.needs_inv = false;
+		}
 	} else {
 		ctx->base.ring = safexcel_select_ring(priv);
-		ctx->base.send = safexcel_aes_send;
-
 		ctx->base.ctxr = dma_pool_zalloc(priv->context_pool,
 						 EIP197_GFP_FLAGS(req->base),
 						 &ctx->base.ctxr_dma);
@@ -476,6 +516,11 @@ static int safexcel_skcipher_cra_init(struct crypto_tfm *tfm)
 			     alg.skcipher.base);
 
 	ctx->priv = tmpl->priv;
+	ctx->base.send = safexcel_send;
+	ctx->base.handle_result = safexcel_handle_result;
+
+	crypto_skcipher_set_reqsize(__crypto_skcipher_cast(tfm),
+				    sizeof(struct safexcel_cipher_req));
 
 	return 0;
 }
diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c
index 74feb6227101..79fe149804d3 100644
--- a/drivers/crypto/inside-secure/safexcel_hash.c
+++ b/drivers/crypto/inside-secure/safexcel_hash.c
@@ -32,6 +32,7 @@ struct safexcel_ahash_req {
 	bool last_req;
 	bool finish;
 	bool hmac;
+	bool needs_inv;
 
 	u8 state_sz;    /* expected sate size, only set once */
 	u32 state[SHA256_DIGEST_SIZE / sizeof(u32)];
@@ -119,9 +120,9 @@ static void safexcel_context_control(struct safexcel_ahash_ctx *ctx,
 	}
 }
 
-static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring,
-				  struct crypto_async_request *async,
-				  bool *should_complete, int *ret)
+static int safexcel_handle_req_result(struct safexcel_crypto_priv *priv, int ring,
+				      struct crypto_async_request *async,
+				      bool *should_complete, int *ret)
 {
 	struct safexcel_result_desc *rdesc;
 	struct ahash_request *areq = ahash_request_cast(async);
@@ -165,9 +166,9 @@ static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring,
 	return 1;
 }
 
-static int safexcel_ahash_send(struct crypto_async_request *async, int ring,
-			       struct safexcel_request *request, int *commands,
-			       int *results)
+static int safexcel_ahash_send_req(struct crypto_async_request *async, int ring,
+				   struct safexcel_request *request,
+				   int *commands, int *results)
 {
 	struct ahash_request *areq = ahash_request_cast(async);
 	struct crypto_ahash *ahash = crypto_ahash_reqtfm(areq);
@@ -292,7 +293,6 @@ send_command:
 
 	req->processed += len;
 	request->req = &areq->base;
-	ctx->base.handle_result = safexcel_handle_result;
 
 	*commands = n_cdesc;
 	*results = 1;
@@ -374,8 +374,6 @@ static int safexcel_handle_inv_result(struct safexcel_crypto_priv *priv,
 
 	ring = safexcel_select_ring(priv);
 	ctx->base.ring = ring;
-	ctx->base.needs_inv = false;
-	ctx->base.send = safexcel_ahash_send;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
 	enq_ret = crypto_enqueue_request(&priv->ring[ring].queue, async);
@@ -392,6 +390,26 @@ static int safexcel_handle_inv_result(struct safexcel_crypto_priv *priv,
 	return 1;
 }
 
+static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring,
+				  struct crypto_async_request *async,
+				  bool *should_complete, int *ret)
+{
+	struct ahash_request *areq = ahash_request_cast(async);
+	struct safexcel_ahash_req *req = ahash_request_ctx(areq);
+	int err;
+
+	if (req->needs_inv) {
+		req->needs_inv = false;
+		err = safexcel_handle_inv_result(priv, ring, async,
+						 should_complete, ret);
+	} else {
+		err = safexcel_handle_req_result(priv, ring, async,
+						 should_complete, ret);
+	}
+
+	return err;
+}
+
 static int safexcel_ahash_send_inv(struct crypto_async_request *async,
 				   int ring, struct safexcel_request *request,
 				   int *commands, int *results)
@@ -400,7 +418,6 @@ static int safexcel_ahash_send_inv(struct crypto_async_request *async,
 	struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq));
 	int ret;
 
-	ctx->base.handle_result = safexcel_handle_inv_result;
 	ret = safexcel_invalidate_cache(async, &ctx->base, ctx->priv,
 					ctx->base.ctxr_dma, ring, request);
 	if (unlikely(ret))
@@ -412,11 +429,29 @@ static int safexcel_ahash_send_inv(struct crypto_async_request *async,
 	return 0;
 }
 
+static int safexcel_ahash_send(struct crypto_async_request *async,
+			       int ring, struct safexcel_request *request,
+			       int *commands, int *results)
+{
+	struct ahash_request *areq = ahash_request_cast(async);
+	struct safexcel_ahash_req *req = ahash_request_ctx(areq);
+	int ret;
+
+	if (req->needs_inv)
+		ret = safexcel_ahash_send_inv(async, ring, request,
+					      commands, results);
+	else
+		ret = safexcel_ahash_send_req(async, ring, request,
+					      commands, results);
+	return ret;
+}
+
 static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm)
 {
 	struct safexcel_ahash_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct safexcel_crypto_priv *priv = ctx->priv;
 	struct ahash_request req;
+	struct safexcel_ahash_req *rctx = ahash_request_ctx(&req);
 	struct safexcel_inv_result result = {};
 	int ring = ctx->base.ring;
 
@@ -430,7 +465,7 @@ static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm)
 	ahash_request_set_tfm(&req, __crypto_ahash_cast(tfm));
 	ctx = crypto_tfm_ctx(req.base.tfm);
 	ctx->base.exit_inv = true;
-	ctx->base.send = safexcel_ahash_send_inv;
+	rctx->needs_inv = true;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
 	crypto_enqueue_request(&priv->ring[ring].queue, &req.base);
@@ -481,14 +516,16 @@ static int safexcel_ahash_enqueue(struct ahash_request *areq)
 	struct safexcel_crypto_priv *priv = ctx->priv;
 	int ret, ring;
 
-	ctx->base.send = safexcel_ahash_send;
+	req->needs_inv = false;
 
 	if (req->processed && ctx->digest == CONTEXT_CONTROL_DIGEST_PRECOMPUTED)
 		ctx->base.needs_inv = safexcel_ahash_needs_inv_get(areq);
 
 	if (ctx->base.ctxr) {
-		if (ctx->base.needs_inv)
-			ctx->base.send = safexcel_ahash_send_inv;
+		if (ctx->base.needs_inv) {
+			ctx->base.needs_inv = false;
+			req->needs_inv = true;
+		}
 	} else {
 		ctx->base.ring = safexcel_select_ring(priv);
 		ctx->base.ctxr = dma_pool_zalloc(priv->context_pool,
@@ -622,6 +659,8 @@ static int safexcel_ahash_cra_init(struct crypto_tfm *tfm)
 			     struct safexcel_alg_template, alg.ahash);
 
 	ctx->priv = tmpl->priv;
+	ctx->base.send = safexcel_ahash_send;
+	ctx->base.handle_result = safexcel_handle_result;
 
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
 				 sizeof(struct safexcel_ahash_req));

From 0a02dcca126280595950f3ea809f77c9cb0a235c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antoine=20T=C3=A9nart?= <antoine.tenart@free-electrons.com>
Date: Mon, 11 Dec 2017 12:10:56 +0100
Subject: [PATCH 104/305] crypto: inside-secure - free requests even if their
 handling failed

This patch frees the request private data even if its handling failed,
as it would never be freed otherwise.

Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver")
Suggested-by: Ofer Heifetz <oferh@marvell.com>
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/inside-secure/safexcel.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c
index 89ba9e85c0f3..4bcef78a08aa 100644
--- a/drivers/crypto/inside-secure/safexcel.c
+++ b/drivers/crypto/inside-secure/safexcel.c
@@ -607,6 +607,7 @@ static inline void safexcel_handle_result_descriptor(struct safexcel_crypto_priv
 		ndesc = ctx->handle_result(priv, ring, sreq->req,
 					   &should_complete, &ret);
 		if (ndesc < 0) {
+			kfree(sreq);
 			dev_err(priv->dev, "failed to handle result (%d)", ndesc);
 			return;
 		}

From 7cad2fabd5691dbb17762877d4e7f236fe4bc181 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antoine=20T=C3=A9nart?= <antoine.tenart@free-electrons.com>
Date: Mon, 11 Dec 2017 12:10:57 +0100
Subject: [PATCH 105/305] crypto: inside-secure - fix request allocations in
 invalidation path

This patch makes use of the SKCIPHER_REQUEST_ON_STACK and
AHASH_REQUEST_ON_STACK helpers to allocate enough memory to contain both
the crypto request structures and their embedded context (__ctx).

Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver")
Suggested-by: Ofer Heifetz <oferh@marvell.com>
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/inside-secure/safexcel_cipher.c | 16 ++++++++--------
 drivers/crypto/inside-secure/safexcel_hash.c   | 14 +++++++-------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c
index 9ea24868d860..fcc0a606d748 100644
--- a/drivers/crypto/inside-secure/safexcel_cipher.c
+++ b/drivers/crypto/inside-secure/safexcel_cipher.c
@@ -422,25 +422,25 @@ static int safexcel_cipher_exit_inv(struct crypto_tfm *tfm)
 {
 	struct safexcel_cipher_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct safexcel_crypto_priv *priv = ctx->priv;
-	struct skcipher_request req;
-	struct safexcel_cipher_req *sreq = skcipher_request_ctx(&req);
+	SKCIPHER_REQUEST_ON_STACK(req, __crypto_skcipher_cast(tfm));
+	struct safexcel_cipher_req *sreq = skcipher_request_ctx(req);
 	struct safexcel_inv_result result = {};
 	int ring = ctx->base.ring;
 
-	memset(&req, 0, sizeof(struct skcipher_request));
+	memset(req, 0, sizeof(struct skcipher_request));
 
 	/* create invalidation request */
 	init_completion(&result.completion);
-	skcipher_request_set_callback(&req, CRYPTO_TFM_REQ_MAY_BACKLOG,
-					safexcel_inv_complete, &result);
+	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				      safexcel_inv_complete, &result);
 
-	skcipher_request_set_tfm(&req, __crypto_skcipher_cast(tfm));
-	ctx = crypto_tfm_ctx(req.base.tfm);
+	skcipher_request_set_tfm(req, __crypto_skcipher_cast(tfm));
+	ctx = crypto_tfm_ctx(req->base.tfm);
 	ctx->base.exit_inv = true;
 	sreq->needs_inv = true;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
-	crypto_enqueue_request(&priv->ring[ring].queue, &req.base);
+	crypto_enqueue_request(&priv->ring[ring].queue, &req->base);
 	spin_unlock_bh(&priv->ring[ring].queue_lock);
 
 	if (!priv->ring[ring].need_dequeue)
diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c
index 79fe149804d3..55ff8a340b11 100644
--- a/drivers/crypto/inside-secure/safexcel_hash.c
+++ b/drivers/crypto/inside-secure/safexcel_hash.c
@@ -450,25 +450,25 @@ static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm)
 {
 	struct safexcel_ahash_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct safexcel_crypto_priv *priv = ctx->priv;
-	struct ahash_request req;
-	struct safexcel_ahash_req *rctx = ahash_request_ctx(&req);
+	AHASH_REQUEST_ON_STACK(req, __crypto_ahash_cast(tfm));
+	struct safexcel_ahash_req *rctx = ahash_request_ctx(req);
 	struct safexcel_inv_result result = {};
 	int ring = ctx->base.ring;
 
-	memset(&req, 0, sizeof(struct ahash_request));
+	memset(req, 0, sizeof(struct ahash_request));
 
 	/* create invalidation request */
 	init_completion(&result.completion);
-	ahash_request_set_callback(&req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   safexcel_inv_complete, &result);
 
-	ahash_request_set_tfm(&req, __crypto_ahash_cast(tfm));
-	ctx = crypto_tfm_ctx(req.base.tfm);
+	ahash_request_set_tfm(req, __crypto_ahash_cast(tfm));
+	ctx = crypto_tfm_ctx(req->base.tfm);
 	ctx->base.exit_inv = true;
 	rctx->needs_inv = true;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
-	crypto_enqueue_request(&priv->ring[ring].queue, &req.base);
+	crypto_enqueue_request(&priv->ring[ring].queue, &req->base);
 	spin_unlock_bh(&priv->ring[ring].queue_lock);
 
 	if (!priv->ring[ring].need_dequeue)

From 2973633e9f09311e849f975d969737af81a521ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antoine=20T=C3=A9nart?= <antoine.tenart@free-electrons.com>
Date: Mon, 11 Dec 2017 12:10:58 +0100
Subject: [PATCH 106/305] crypto: inside-secure - do not use areq->result for
 partial results

This patch updates the SafeXcel driver to stop using the crypto
ahash_request result field for partial results (i.e. on updates).
Instead the driver local safexcel_ahash_req state field is used, and
only on final operations the ahash_request result buffer is updated.

Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver")
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/inside-secure/safexcel_hash.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c
index 55ff8a340b11..0c5a5820b06e 100644
--- a/drivers/crypto/inside-secure/safexcel_hash.c
+++ b/drivers/crypto/inside-secure/safexcel_hash.c
@@ -35,7 +35,7 @@ struct safexcel_ahash_req {
 	bool needs_inv;
 
 	u8 state_sz;    /* expected sate size, only set once */
-	u32 state[SHA256_DIGEST_SIZE / sizeof(u32)];
+	u32 state[SHA256_DIGEST_SIZE / sizeof(u32)] __aligned(sizeof(u32));
 
 	u64 len;
 	u64 processed;
@@ -128,7 +128,7 @@ static int safexcel_handle_req_result(struct safexcel_crypto_priv *priv, int rin
 	struct ahash_request *areq = ahash_request_cast(async);
 	struct crypto_ahash *ahash = crypto_ahash_reqtfm(areq);
 	struct safexcel_ahash_req *sreq = ahash_request_ctx(areq);
-	int cache_len, result_sz = sreq->state_sz;
+	int cache_len;
 
 	*ret = 0;
 
@@ -149,8 +149,8 @@ static int safexcel_handle_req_result(struct safexcel_crypto_priv *priv, int rin
 	spin_unlock_bh(&priv->ring[ring].egress_lock);
 
 	if (sreq->finish)
-		result_sz = crypto_ahash_digestsize(ahash);
-	memcpy(sreq->state, areq->result, result_sz);
+		memcpy(areq->result, sreq->state,
+		       crypto_ahash_digestsize(ahash));
 
 	dma_unmap_sg(priv->dev, areq->src,
 		     sg_nents_for_len(areq->src, areq->nbytes), DMA_TO_DEVICE);
@@ -274,7 +274,7 @@ send_command:
 	/* Add the token */
 	safexcel_hash_token(first_cdesc, len, req->state_sz);
 
-	ctx->base.result_dma = dma_map_single(priv->dev, areq->result,
+	ctx->base.result_dma = dma_map_single(priv->dev, req->state,
 					      req->state_sz, DMA_FROM_DEVICE);
 	if (dma_mapping_error(priv->dev, ctx->base.result_dma)) {
 		ret = -EINVAL;

From 32aa144fc32abfcbf7140f473dfbd94c5b9b4105 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 15 Dec 2017 13:14:31 +0100
Subject: [PATCH 107/305] KVM: s390: fix cmma migration for multiple memory
 slots

When multiple memory slots are present the cmma migration code
does not allocate enough memory for the bitmap. The memory slots
are sorted in reverse order, so we must use gfn and size of
slot[0] instead of the last one.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Cc: stable@vger.kernel.org # 4.13+
Fixes: 190df4a212a7 (KVM: s390: CMMA tracking, ESSA emulation, migration mode)
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index efa439f6ffb3..abcd24fdde3f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -792,11 +792,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
 
 	if (kvm->arch.use_cmma) {
 		/*
-		 * Get the last slot. They should be sorted by base_gfn, so the
-		 * last slot is also the one at the end of the address space.
-		 * We have verified above that at least one slot is present.
+		 * Get the first slot. They are reverse sorted by base_gfn, so
+		 * the first slot is also the one at the end of the address
+		 * space. We have verified above that at least one slot is
+		 * present.
 		 */
-		ms = slots->memslots + slots->used_slots - 1;
+		ms = slots->memslots;
 		/* round up so we only use full longs */
 		ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG);
 		/* allocate enough bytes to store all the bits */

From c2cf265d860882b51a200e4a7553c17827f2b730 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 21 Dec 2017 09:18:22 +0100
Subject: [PATCH 108/305] KVM: s390: prevent buffer overrun on memory hotplug
 during migration

We must not go beyond the pre-allocated buffer. This can happen when
a new memory slot is added during migration.

Reported-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: stable@vger.kernel.org # 4.13+
Fixes: 190df4a212a7 (KVM: s390: CMMA tracking, ESSA emulation, migration mode)
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
---
 arch/s390/kvm/priv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 572496c688cc..0714bfa56da0 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -1006,7 +1006,7 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
 		cbrlo[entries] = gfn << PAGE_SHIFT;
 	}
 
-	if (orc) {
+	if (orc && gfn < ms->bitmap_size) {
 		/* increment only if we are really flipping the bit to 1 */
 		if (!test_and_set_bit(gfn, ms->pgste_bitmap))
 			atomic64_inc(&ms->dirty_pages);

From 8a42d3fc9dfccbf601c5f58f46dc3cdbc1a4b923 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Fri, 15 Dec 2017 13:42:04 +0000
Subject: [PATCH 109/305] nvmem: meson-mx-efuse: fix reading from an offset
 other than 0

meson_mx_efuse_read calculates the address internal to the eFuse based
on the offset and the word size. This works fine with any given offset.
However, the offset is also included when writing to the output buffer.
This means that reading 4 bytes at offset 500 tries to write beyond the
array allocated by the nvmem core as it wants to write the 4 bytes to
"buffer address + offset (500)".
This issue did not show up in the previous tests since no driver uses
any value from the eFuse yet and reading the eFuse via sysfs simply
reads the whole eFuse, starting at offset 0.

Fix this by only including the offset in the internal address
calculation.

Fixes: 8caef1fa9176 ("nvmem: add a driver for the Amlogic Meson6/Meson8/Meson8b SoCs")
Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/meson-mx-efuse.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvmem/meson-mx-efuse.c b/drivers/nvmem/meson-mx-efuse.c
index a346b4923550..41d3a3c1104e 100644
--- a/drivers/nvmem/meson-mx-efuse.c
+++ b/drivers/nvmem/meson-mx-efuse.c
@@ -156,8 +156,8 @@ static int meson_mx_efuse_read(void *context, unsigned int offset,
 				 MESON_MX_EFUSE_CNTL1_AUTO_RD_ENABLE,
 				 MESON_MX_EFUSE_CNTL1_AUTO_RD_ENABLE);
 
-	for (i = offset; i < offset + bytes; i += efuse->config.word_size) {
-		addr = i / efuse->config.word_size;
+	for (i = 0; i < bytes; i += efuse->config.word_size) {
+		addr = (offset + i) / efuse->config.word_size;
 
 		err = meson_mx_efuse_read_addr(efuse, addr, &tmp);
 		if (err)

From a89f040fa34ec9cd682aed98b8f04e3c47d998bd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:33 +0100
Subject: [PATCH 110/305] x86/cpufeatures: Add X86_BUG_CPU_INSECURE

Many x86 CPUs leak information to user space due to missing isolation of
user space and kernel space page tables. There are many well documented
ways to exploit that.

The upcoming software mitigation of isolating the user and kernel space
page tables needs a misfeature flag so code can be made runtime
conditional.

Add the BUG bit which indicates that the CPU is affected and add a feature
bit which indicates that the software mitigation is enabled.

Assume for now that _ALL_ x86 CPUs are affected by this. Exceptions can be
made later.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeatures.h       | 3 ++-
 arch/x86/include/asm/disabled-features.h | 8 +++++++-
 arch/x86/kernel/cpu/common.c             | 4 ++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 800104c8a3ed..d8ec834ea884 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -201,7 +201,7 @@
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
-
+#define X86_FEATURE_PTI			( 7*32+11) /* Kernel Page Table Isolation enabled */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -340,5 +340,6 @@
 #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_INSECURE		X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index c10c9128f54e..e428e16dd822 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -44,6 +44,12 @@
 # define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))
 #endif
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define DISABLE_PTI		0
+#else
+# define DISABLE_PTI		(1 << (X86_FEATURE_PTI & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -54,7 +60,7 @@
 #define DISABLED_MASK4	(DISABLE_PCID)
 #define DISABLED_MASK5	0
 #define DISABLED_MASK6	0
-#define DISABLED_MASK7	0
+#define DISABLED_MASK7	(DISABLE_PTI)
 #define DISABLED_MASK8	0
 #define DISABLED_MASK9	(DISABLE_MPX)
 #define DISABLED_MASK10	0
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8ddcfa4d4165..a9210f9b7cf8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -898,6 +898,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	}
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+
+	/* Assume for now that ALL x86 CPUs are insecure */
+	setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+
 	fpu__init_system(c);
 
 #ifdef CONFIG_X86_32

From c313ec66317d421fb5768d78c56abed2dc862264 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:34 +0100
Subject: [PATCH 111/305] x86/mm/pti: Disable global pages if
 PAGE_TABLE_ISOLATION=y

Global pages stay in the TLB across context switches.  Since all contexts
share the same kernel mapping, these mappings are marked as global pages
so kernel entries in the TLB are not flushed out on a context switch.

But, even having these entries in the TLB opens up something that an
attacker can use, such as the double-page-fault attack:

   http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf

That means that even when PAGE_TABLE_ISOLATION switches page tables
on return to user space the global pages would stay in the TLB cache.

Disable global pages so that kernel TLB entries can be flushed before
returning to user space. This way, all accesses to kernel addresses from
userspace result in a TLB miss independent of the existence of a kernel
mapping.

Suppress global pages via the __supported_pte_mask. The user space
mappings set PAGE_GLOBAL for the minimal kernel mappings which are
required for entry/exit. These mappings are set up manually so the
filtering does not take place.

[ The __supported_pte_mask simplification was written by Thomas Gleixner. ]
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a22c2b95e513..020223420308 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -161,6 +161,12 @@ struct map_range {
 
 static int page_size_mask;
 
+static void enable_global_pages(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		__supported_pte_mask |= _PAGE_GLOBAL;
+}
+
 static void __init probe_page_size_mask(void)
 {
 	/*
@@ -179,11 +185,11 @@ static void __init probe_page_size_mask(void)
 		cr4_set_bits_and_update_boot(X86_CR4_PSE);
 
 	/* Enable PGE if available */
+	__supported_pte_mask &= ~_PAGE_GLOBAL;
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
 		cr4_set_bits_and_update_boot(X86_CR4_PGE);
-		__supported_pte_mask |= _PAGE_GLOBAL;
-	} else
-		__supported_pte_mask &= ~_PAGE_GLOBAL;
+		enable_global_pages();
+	}
 
 	/* Enable 1 GB linear kernel mappings if available: */
 	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {

From 8a09317b895f073977346779df52f67c1056d81d Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:35 +0100
Subject: [PATCH 112/305] x86/mm/pti: Prepare the x86/entry assembly code for
 entry/exit CR3 switching

PAGE_TABLE_ISOLATION needs to switch to a different CR3 value when it
enters the kernel and switch back when it exits.  This essentially needs to
be done before leaving assembly code.

This is extra challenging because the switching context is tricky: the
registers that can be clobbered can vary.  It is also hard to store things
on the stack because there is an established ABI (ptregs) or the stack is
entirely unsafe to use.

Establish a set of macros that allow changing to the user and kernel CR3
values.

Interactions with SWAPGS:

  Previous versions of the PAGE_TABLE_ISOLATION code relied on having
  per-CPU scratch space to save/restore a register that can be used for the
  CR3 MOV.  The %GS register is used to index into our per-CPU space, so
  SWAPGS *had* to be done before the CR3 switch.  That scratch space is gone
  now, but the semantic that SWAPGS must be done before the CR3 MOV is
  retained.  This is good to keep because it is not that hard to do and it
  allows things like adding per-CPU debugging information.

What this does in the NMI code is worth pointing out.  NMIs can interrupt
*any* context and they can also be nested with NMIs interrupting other
NMIs.  The comments below ".Lnmi_from_kernel" explain the format of the
stack during this situation.  Changing the format of this stack is hard.
Instead of storing the old CR3 value on the stack, this depends on the
*regular* register save/restore mechanism and then uses %r14 to keep CR3
during the NMI.  It is callee-saved and will not be clobbered by the C NMI
handlers that get called.

[ PeterZ: ESPFIX optimization ]

Based-on-code-from: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h         | 66 ++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_64.S        | 45 +++++++++++++++++++---
 arch/x86/entry/entry_64_compat.S | 24 +++++++++++-
 3 files changed, 128 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3fd8bc560fae..a9d17a7686ab 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,6 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/jump_label.h>
 #include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
 
 /*
 
@@ -187,6 +189,70 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/* PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two halves: */
+#define PTI_SWITCH_MASK (1<<PAGE_SHIFT)
+
+.macro ADJUST_KERNEL_CR3 reg:req
+	/* Clear "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+	andq	$(~PTI_SWITCH_MASK), \reg
+.endm
+
+.macro ADJUST_USER_CR3 reg:req
+	/* Move CR3 up a page to the user page tables: */
+	orq	$(PTI_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+	mov	%cr3, \scratch_reg
+	ADJUST_KERNEL_CR3 \scratch_reg
+	mov	\scratch_reg, %cr3
+.endm
+
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+	mov	%cr3, \scratch_reg
+	ADJUST_USER_CR3 \scratch_reg
+	mov	\scratch_reg, %cr3
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+	movq	%cr3, \scratch_reg
+	movq	\scratch_reg, \save_reg
+	/*
+	 * Is the switch bit zero?  This means the address is
+	 * up in real PAGE_TABLE_ISOLATION patches in a moment.
+	 */
+	testq	$(PTI_SWITCH_MASK), \scratch_reg
+	jz	.Ldone_\@
+
+	ADJUST_KERNEL_CR3 \scratch_reg
+	movq	\scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 save_reg:req
+	/*
+	 * The CR3 write could be avoided when not changing its value,
+	 * but would require a CR3 read *and* a scratch register.
+	 */
+	movq	\save_reg, %cr3
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 save_reg:req
+.endm
+
+#endif
+
 #endif /* CONFIG_X86_64 */
 
 /*
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 87cebe78bbef..2ad7ad4d3dd6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -164,6 +164,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	/* Stash the user RSP. */
 	movq	%rsp, RSP_SCRATCH
 
+	/* Note: using %rsp as a scratch reg. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	/* Load the top of the task stack into RSP */
 	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
 
@@ -203,6 +206,10 @@ ENTRY(entry_SYSCALL_64)
 	 */
 
 	swapgs
+	/*
+	 * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+	 * is not required to switch CR3.
+	 */
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
@@ -399,6 +406,7 @@ syscall_return_via_sysret:
 	 * We are on the trampoline stack.  All regs except RDI are live.
 	 * We can do future final exit work right here.
 	 */
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi
 
 	popq	%rdi
 	popq	%rsp
@@ -736,6 +744,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	 * We can do future final exit work right here.
 	 */
 
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi
+
 	/* Restore RDI. */
 	popq	%rdi
 	SWAPGS
@@ -818,7 +828,9 @@ native_irq_return_ldt:
 	 */
 
 	pushq	%rdi				/* Stash user RDI */
-	SWAPGS
+	SWAPGS					/* to kernel GS */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */
+
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	%rax, (0*8)(%rdi)		/* user RAX */
 	movq	(1*8)(%rsp), %rax		/* user RIP */
@@ -834,7 +846,6 @@ native_irq_return_ldt:
 	/* Now RAX == RSP. */
 
 	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
-	popq	%rdi				/* Restore user RDI */
 
 	/*
 	 * espfix_stack[31:16] == 0.  The page tables are set up such that
@@ -845,7 +856,11 @@ native_irq_return_ldt:
 	 * still points to an RO alias of the ESPFIX stack.
 	 */
 	orq	PER_CPU_VAR(espfix_stack), %rax
-	SWAPGS
+
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi	/* to user CR3 */
+	SWAPGS					/* to user GS */
+	popq	%rdi				/* Restore user RDI */
+
 	movq	%rax, %rsp
 	UNWIND_HINT_IRET_REGS offset=8
 
@@ -945,6 +960,8 @@ ENTRY(switch_to_thread_stack)
 	UNWIND_HINT_FUNC
 
 	pushq	%rdi
+	/* Need to switch before accessing the thread stack. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
 	movq	%rsp, %rdi
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@ -1244,7 +1261,11 @@ ENTRY(paranoid_entry)
 	js	1f				/* negative -> in kernel */
 	SWAPGS
 	xorl	%ebx, %ebx
-1:	ret
+
+1:
+	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+	ret
 END(paranoid_entry)
 
 /*
@@ -1266,6 +1287,7 @@ ENTRY(paranoid_exit)
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
+	RESTORE_CR3	save_reg=%r14
 	SWAPGS_UNSAFE_STACK
 	jmp	.Lparanoid_exit_restore
 .Lparanoid_exit_no_swapgs:
@@ -1293,6 +1315,8 @@ ENTRY(error_entry)
 	 * from user mode due to an IRET fault.
 	 */
 	SWAPGS
+	/* We have user CR3.  Change to kernel CR3. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 .Lerror_entry_from_usermode_after_swapgs:
 	/* Put us onto the real thread stack. */
@@ -1339,6 +1363,7 @@ ENTRY(error_entry)
 	 * .Lgs_change's error handler with kernel gsbase.
 	 */
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 	jmp .Lerror_entry_done
 
 .Lbstep_iret:
@@ -1348,10 +1373,11 @@ ENTRY(error_entry)
 
 .Lerror_bad_iret:
 	/*
-	 * We came from an IRET to user mode, so we have user gsbase.
-	 * Switch to kernel gsbase:
+	 * We came from an IRET to user mode, so we have user
+	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
 	 */
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
@@ -1383,6 +1409,10 @@ END(error_exit)
 /*
  * Runs on exception stack.  Xen PV does not go through this path at all,
  * so we can use real assembly here.
+ *
+ * Registers:
+ *	%r14: Used to save/restore the CR3 of the interrupted context
+ *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
  */
 ENTRY(nmi)
 	UNWIND_HINT_IRET_REGS
@@ -1446,6 +1476,7 @@ ENTRY(nmi)
 
 	swapgs
 	cld
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1698,6 +1729,8 @@ end_repeat_nmi:
 	movq	$-1, %rsi
 	call	do_nmi
 
+	RESTORE_CR3 save_reg=%r14
+
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 nmi_swapgs:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 95ad40eb7eff..05238b29895e 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -49,6 +49,10 @@
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
 	SWAPGS
+
+	/* We are about to clobber %rsp anyway, clobbering here is OK */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/*
@@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 
+	/*
+	 * We just saved %rdi so it is safe to clobber.  It is not
+	 * preserved during the C calls inside TRACE_IRQS_OFF anyway.
+	 */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * turned them off.
@@ -256,10 +266,22 @@ sysret32_from_system_call:
 	 * when the system call started, which is already known to user
 	 * code.  We zero R8-R10 to avoid info leaks.
          */
+	movq	RSP-ORIG_RAX(%rsp), %rsp
+
+	/*
+	 * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+	 * on the process stack which is not mapped to userspace and
+	 * not readable after we SWITCH_TO_USER_CR3.  Delay the CR3
+	 * switch until after after the last reference to the process
+	 * stack.
+	 *
+	 * %r8 is zeroed before the sysret, thus safe to clobber.
+	 */
+	SWITCH_TO_USER_CR3 scratch_reg=%r8
+
 	xorq	%r8, %r8
 	xorq	%r9, %r9
 	xorq	%r10, %r10
-	movq	RSP-ORIG_RAX(%rsp), %rsp
 	swapgs
 	sysretl
 END(entry_SYSCALL_compat)

From aa8c6248f8c75acfd610fe15d8cae23cf70d9d09 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:36 +0100
Subject: [PATCH 113/305] x86/mm/pti: Add infrastructure for page table
 isolation

Add the initial files for kernel page table isolation, with a minimal init
function and the boot time detection for this misfeature.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         |  2 +
 arch/x86/boot/compressed/pagetable.c          |  3 +
 arch/x86/entry/calling.h                      |  7 ++
 arch/x86/include/asm/pti.h                    | 14 ++++
 arch/x86/mm/Makefile                          |  7 +-
 arch/x86/mm/init.c                            |  2 +
 arch/x86/mm/pti.c                             | 84 +++++++++++++++++++
 include/linux/pti.h                           | 11 +++
 init/main.c                                   |  3 +
 9 files changed, 130 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/include/asm/pti.h
 create mode 100644 arch/x86/mm/pti.c
 create mode 100644 include/linux/pti.h

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 05496622b4ef..5dfd26265484 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2685,6 +2685,8 @@
 			steal time is computed, but won't influence scheduler
 			behaviour
 
+	nopti		[X86-64] Disable kernel page table isolation
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 972319ff5b01..e691ff734cb5 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -23,6 +23,9 @@
  */
 #undef CONFIG_AMD_MEM_ENCRYPT
 
+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
 #include "misc.h"
 
 /* These actually do the work of building the kernel identity maps. */
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index a9d17a7686ab..3d3389a92c33 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -205,18 +205,23 @@ For 32-bit we have the following conventions - kernel is built with
 .endm
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 	mov	%cr3, \scratch_reg
 	ADJUST_KERNEL_CR3 \scratch_reg
 	mov	\scratch_reg, %cr3
+.Lend_\@:
 .endm
 
 .macro SWITCH_TO_USER_CR3 scratch_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 	mov	%cr3, \scratch_reg
 	ADJUST_USER_CR3 \scratch_reg
 	mov	\scratch_reg, %cr3
+.Lend_\@:
 .endm
 
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
 	movq	%cr3, \scratch_reg
 	movq	\scratch_reg, \save_reg
 	/*
@@ -233,11 +238,13 @@ For 32-bit we have the following conventions - kernel is built with
 .endm
 
 .macro RESTORE_CR3 save_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 	/*
 	 * The CR3 write could be avoided when not changing its value,
 	 * but would require a CR3 read *and* a scratch register.
 	 */
 	movq	\save_reg, %cr3
+.Lend_\@:
 .endm
 
 #else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
new file mode 100644
index 000000000000..0b5ef05b2d2d
--- /dev/null
+++ b/arch/x86/include/asm/pti.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _ASM_X86_PTI_H
+#define _ASM_X86_PTI_H
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern void pti_init(void);
+extern void pti_check_boottime_disable(void);
+#else
+static inline void pti_check_boottime_disable(void) { }
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PTI_H */
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 2e0017af8f9b..52906808e277 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -43,9 +43,10 @@ obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
-obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_X86_INTEL_MPX)			+= mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION)		+= pti.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 020223420308..af75069fb116 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -20,6 +20,7 @@
 #include <asm/kaslr.h>
 #include <asm/hypervisor.h>
 #include <asm/cpufeature.h>
+#include <asm/pti.h>
 
 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -630,6 +631,7 @@ void __init init_mem_mapping(void)
 {
 	unsigned long end;
 
+	pti_check_boottime_disable();
 	probe_page_size_mask();
 	setup_pcid();
 
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
new file mode 100644
index 000000000000..375f23a758bc
--- /dev/null
+++ b/arch/x86/mm/pti.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * This code is based in part on work published here:
+ *
+ *	https://github.com/IAIK/KAISER
+ *
+ * The original work was written by and and signed off by for the Linux
+ * kernel by:
+ *
+ *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
+ *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
+ *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
+ *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
+ *
+ * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
+ * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
+ *		       Andy Lutomirsky <luto@amacapital.net>
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hypervisor.h>
+#include <asm/cmdline.h>
+#include <asm/pti.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt
+
+static void __init pti_print_if_insecure(const char *reason)
+{
+	if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		pr_info("%s\n", reason);
+}
+
+void __init pti_check_boottime_disable(void)
+{
+	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+		pti_print_if_insecure("disabled on XEN PV.");
+		return;
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
+		pti_print_if_insecure("disabled on command line.");
+		return;
+	}
+
+	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		return;
+
+	setup_force_cpu_cap(X86_FEATURE_PTI);
+}
+
+/*
+ * Initialize kernel page table isolation
+ */
+void __init pti_init(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("enabled\n");
+}
diff --git a/include/linux/pti.h b/include/linux/pti.h
new file mode 100644
index 000000000000..0174883a935a
--- /dev/null
+++ b/include/linux/pti.h
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _INCLUDE_PTI_H
+#define _INCLUDE_PTI_H
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#include <asm/pti.h>
+#else
+static inline void pti_init(void) { }
+#endif
+
+#endif
diff --git a/init/main.c b/init/main.c
index 8a390f60ec81..b32ec72cdf3d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -75,6 +75,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
+#include <linux/pti.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/sched_clock.h>
@@ -506,6 +507,8 @@ static void __init mm_init(void)
 	ioremap_huge_init();
 	/* Should be run before the first non-init thread is created */
 	init_espfix_bsp();
+	/* Should be run after espfix64 is set up. */
+	pti_init();
 }
 
 asmlinkage __visible void __init start_kernel(void)

From 41f4c20b57a4890ea7f56ff8717cc83fefb8d537 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 12 Dec 2017 14:39:52 +0100
Subject: [PATCH 114/305] x86/pti: Add the pti= cmdline option and
 documentation

Keep the "nopti" option for traditional reasons.

[ tglx: Don't allow force on when running on XEN PV and made 'on'
	printout conditional ]

Requested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171212133952.10177-1-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         |  6 +++++
 arch/x86/mm/pti.c                             | 26 ++++++++++++++++++-
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 5dfd26265484..520fdec15bbb 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3255,6 +3255,12 @@
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
+	pti=		[X86_64]
+			Control user/kernel address space isolation:
+			on - enable
+			off - disable
+			auto - default setting
+
 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 375f23a758bc..a13f6b109865 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -54,21 +54,45 @@ static void __init pti_print_if_insecure(const char *reason)
 		pr_info("%s\n", reason);
 }
 
+static void __init pti_print_if_secure(const char *reason)
+{
+	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		pr_info("%s\n", reason);
+}
+
 void __init pti_check_boottime_disable(void)
 {
+	char arg[5];
+	int ret;
+
 	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
 		pti_print_if_insecure("disabled on XEN PV.");
 		return;
 	}
 
+	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+	if (ret > 0)  {
+		if (ret == 3 && !strncmp(arg, "off", 3)) {
+			pti_print_if_insecure("disabled on command line.");
+			return;
+		}
+		if (ret == 2 && !strncmp(arg, "on", 2)) {
+			pti_print_if_secure("force enabled on command line.");
+			goto enable;
+		}
+		if (ret == 4 && !strncmp(arg, "auto", 4))
+			goto autosel;
+	}
+
 	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
 		pti_print_if_insecure("disabled on command line.");
 		return;
 	}
 
+autosel:
 	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
 		return;
-
+enable:
 	setup_force_cpu_cap(X86_FEATURE_PTI);
 }
 

From 61e9b3671007a5da8127955a1a3bda7e0d5f42e8 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:37 +0100
Subject: [PATCH 115/305] x86/mm/pti: Add mapping helper functions

Add the pagetable helper functions to manage the separate user space page
tables.

[ tglx: Split out from the big combo kaiser patch. Folded Andy's
	simplification and made it out of line as Boris suggested ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h    |  6 +-
 arch/x86/include/asm/pgtable_64.h | 92 +++++++++++++++++++++++++++++++
 arch/x86/mm/pti.c                 | 41 ++++++++++++++
 3 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f735c3016325..af38d93c4fbb 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -909,7 +909,11 @@ static inline int pgd_none(pgd_t pgd)
  * pgd_offset() returns a (pgd_t *)
  * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
  */
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
+/*
+ * a shortcut to get a pgd_t in a given mm
+ */
+#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
 /*
  * a shortcut which implies the use of the kernel's pgd, instead
  * of a process's
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e9f05331e732..81462e9a34f6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
 #endif
 }
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and
+ * the user one is in the last 4k.  To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT	PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+
+	__ptr |= BIT(bit);
+	return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+
+	__ptr &= ~BIT(bit);
+	return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+	return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+	return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+	return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+	return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+	unsigned long ptr = (unsigned long)__ptr;
+
+	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
+}
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return pgd;
+	return __pti_set_user_pgd(pgdp, pgd);
+}
+#else
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	return pgd;
+}
+#endif
+
 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
+	p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
+#else
 	*p4dp = p4d;
+#endif
 }
 
 static inline void native_p4d_clear(p4d_t *p4d)
@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	*pgdp = pti_set_user_pgd(pgdp, pgd);
+#else
 	*pgdp = pgd;
+#endif
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index a13f6b109865..69a983365392 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -96,6 +96,47 @@ enable:
 	setup_force_cpu_cap(X86_FEATURE_PTI);
 }
 
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	/*
+	 * Changes to the high (kernel) portion of the kernelmode page
+	 * tables are not automatically propagated to the usermode tables.
+	 *
+	 * Users should keep in mind that, unlike the kernelmode tables,
+	 * there is no vmalloc_fault equivalent for the usermode tables.
+	 * Top-level entries added to init_mm's usermode pgd after boot
+	 * will not be automatically propagated to other mms.
+	 */
+	if (!pgdp_maps_userspace(pgdp))
+		return pgd;
+
+	/*
+	 * The user page tables get the full PGD, accessible from
+	 * userspace:
+	 */
+	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
+
+	/*
+	 * If this is normal user memory, make it NX in the kernel
+	 * pagetables so that, if we somehow screw up and return to
+	 * usermode with the kernel CR3 loaded, we'll get a page fault
+	 * instead of allowing user code to execute with the wrong CR3.
+	 *
+	 * As exceptions, we don't set NX if:
+	 *  - _PAGE_USER is not set.  This could be an executable
+	 *     EFI runtime mapping or something similar, and the kernel
+	 *     may execute from it
+	 *  - we don't have NX support
+	 *  - we're clearing the PGD (i.e. the new pgd is not present).
+	 */
+	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
+	    (__supported_pte_mask & _PAGE_NX))
+		pgd.pgd |= _PAGE_NX;
+
+	/* return the copy of the PGD we want the kernel to use: */
+	return pgd;
+}
+
 /*
  * Initialize kernel page table isolation
  */

From 1c4de1ff4fe50453b968579ee86fac3da80dd783 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:38 +0100
Subject: [PATCH 116/305] x86/mm/pti: Allow NX poison to be set in p4d/pgd

With PAGE_TABLE_ISOLATION the user portion of the kernel page tables is
poisoned with the NX bit so if the entry code exits with the kernel page
tables selected in CR3, userspace crashes.

But doing so trips the p4d/pgd_bad() checks.  Make sure it does not do
that.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af38d93c4fbb..2d2d07300b4a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -846,7 +846,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 
 static inline int p4d_bad(p4d_t p4d)
 {
-	return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+	unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (p4d_flags(p4d) & ~ignore_flags) != 0;
 }
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 
@@ -880,7 +885,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 
 static inline int pgd_bad(pgd_t pgd)
 {
-	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	unsigned long ignore_flags = _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)

From d9e9a6418065bb376e5de8d93ce346939b9a37a6 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:39 +0100
Subject: [PATCH 117/305] x86/mm/pti: Allocate a separate user PGD

Kernel page table isolation requires two PGDs: one for the kernel, which
contains the full kernel mapping plus the user space mapping, and one for
user space, which contains the user space mappings and the minimal set of
kernel mappings which are required by the architecture to be able to
transition from and to user space.

Add the necessary preliminaries.

[ tglx: Split out from the big kaiser dump. EFI fixup from Kirill ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgalloc.h | 11 +++++++++++
 arch/x86/kernel/head_64.S      | 30 +++++++++++++++++++++++++++---
 arch/x86/mm/pgtable.c          |  5 +++--
 arch/x86/platform/efi/efi_64.c |  5 ++++-
 4 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 4b5e1eafada7..aff42e1da6ee 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
  */
 extern gfp_t __userpte_alloc_gfp;
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Instead of one PGD, we acquire two PGDs.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
 /*
  * Allocate and free page tables.
  */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7dca675fe78d..04a625f0fcda 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
 	.balign	PAGE_SIZE; \
 GLOBAL(name)
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL	512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+	.balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define PTI_USER_PGD_FILL	0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
 	i = 0 ;						\
@@ -350,13 +371,14 @@ GLOBAL(name)
 	.endr
 
 	__INITDATA
-NEXT_PAGE(early_top_pgt)
+NEXT_PGD_PAGE(early_top_pgt)
 	.fill	511,8,0
 #ifdef CONFIG_X86_5LEVEL
 	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #else
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
+	.fill	PTI_USER_PGD_FILL,8,0
 
 NEXT_PAGE(early_dynamic_pgts)
 	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
 	.data
 
 #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+	.fill	PTI_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
 	 */
 	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 #else
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.fill	512,8,0
+	.fill	PTI_USER_PGD_FILL,8,0
 #endif
 
 #ifdef CONFIG_X86_5LEVEL
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17ebc5a978cc..9b7bcbd33cc2 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
 		kmem_cache_free(pgd_cache, pgd);
 }
 #else
+
 static inline pgd_t *_pgd_alloc(void)
 {
-	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
-	free_page((unsigned long)pgd);
+	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 }
 #endif /* CONFIG_X86_PAE */
 
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 20fb31579b69..39c4b35ac7a4 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -195,6 +195,9 @@ static pgd_t *efi_pgd;
  * because we want to avoid inserting EFI region mappings (EFI_VA_END
  * to EFI_VA_START) into the standard kernel page tables. Everything
  * else can be shared, see efi_sync_low_kernel_mappings().
+ *
+ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
+ * allocation.
  */
 int __init efi_alloc_page_tables(void)
 {
@@ -207,7 +210,7 @@ int __init efi_alloc_page_tables(void)
 		return 0;
 
 	gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
-	efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+	efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
 	if (!efi_pgd)
 		return -ENOMEM;
 

From fc2fbc8512ed08d1de7720936fd7d2e4ce02c3a2 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:40 +0100
Subject: [PATCH 118/305] x86/mm/pti: Populate user PGD

In clone_pgd_range() copy the init user PGDs which cover the kernel half of
the address space, so a process has all the required kernel mappings
visible.

[ tglx: Split out from the big kaiser dump and folded Andy's simplification ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2d2d07300b4a..cc6fa75884e9 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1119,7 +1119,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  */
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
-       memcpy(dst, src, count * sizeof(pgd_t));
+	memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+	/* Clone the user space pgd as well */
+	memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+	       count * sizeof(pgd_t));
+#endif
 }
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)

From 03f4424f348e8be95eb1bbeba09461cd7b867828 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:42 +0100
Subject: [PATCH 119/305] x86/mm/pti: Add functions to clone kernel PMDs

Provide infrastructure to:

 - find a kernel PMD for a mapping which must be visible to user space for
   the entry/exit code to work.

 - walk an address range and share the kernel PMD with it.

This reuses a small part of the original KAISER patches to populate the
user space page table.

[ tglx: Made it universally usable so it can be used for any kind of shared
	mapping. Added a mechanism to clear specific bits in the user space
	visible PMD entry. Folded Andy's simplifications ]

Originally-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pti.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 69a983365392..d58bcee470fc 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -48,6 +48,11 @@
 #undef pr_fmt
 #define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt
 
+/* Backporting helper */
+#ifndef __GFP_NOTRACK
+#define __GFP_NOTRACK	0
+#endif
+
 static void __init pti_print_if_insecure(const char *reason)
 {
 	if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
@@ -137,6 +142,128 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
 	return pgd;
 }
 
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a P4D on success, or NULL on failure.
+ */
+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+{
+	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+	if (address < PAGE_OFFSET) {
+		WARN_ONCE(1, "attempt to walk user address\n");
+		return NULL;
+	}
+
+	if (pgd_none(*pgd)) {
+		unsigned long new_p4d_page = __get_free_page(gfp);
+		if (!new_p4d_page)
+			return NULL;
+
+		if (pgd_none(*pgd)) {
+			set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
+			new_p4d_page = 0;
+		}
+		if (new_p4d_page)
+			free_page(new_p4d_page);
+	}
+	BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+	return p4d_offset(pgd, address);
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a PMD on success, or NULL on failure.
+ */
+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+{
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+	pud_t *pud;
+
+	BUILD_BUG_ON(p4d_large(*p4d) != 0);
+	if (p4d_none(*p4d)) {
+		unsigned long new_pud_page = __get_free_page(gfp);
+		if (!new_pud_page)
+			return NULL;
+
+		if (p4d_none(*p4d)) {
+			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+			new_pud_page = 0;
+		}
+		if (new_pud_page)
+			free_page(new_pud_page);
+	}
+
+	pud = pud_offset(p4d, address);
+	/* The user page tables do not use large mappings: */
+	if (pud_large(*pud)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pud_none(*pud)) {
+		unsigned long new_pmd_page = __get_free_page(gfp);
+		if (!new_pmd_page)
+			return NULL;
+
+		if (pud_none(*pud)) {
+			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+			new_pmd_page = 0;
+		}
+		if (new_pmd_page)
+			free_page(new_pmd_page);
+	}
+
+	return pmd_offset(pud, address);
+}
+
+static void __init
+pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+{
+	unsigned long addr;
+
+	/*
+	 * Clone the populated PMDs which cover start to end. These PMD areas
+	 * can have holes.
+	 */
+	for (addr = start; addr < end; addr += PMD_SIZE) {
+		pmd_t *pmd, *target_pmd;
+		pgd_t *pgd;
+		p4d_t *p4d;
+		pud_t *pud;
+
+		pgd = pgd_offset_k(addr);
+		if (WARN_ON(pgd_none(*pgd)))
+			return;
+		p4d = p4d_offset(pgd, addr);
+		if (WARN_ON(p4d_none(*p4d)))
+			return;
+		pud = pud_offset(p4d, addr);
+		if (pud_none(*pud))
+			continue;
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(*pmd))
+			continue;
+
+		target_pmd = pti_user_pagetable_walk_pmd(addr);
+		if (WARN_ON(!target_pmd))
+			return;
+
+		/*
+		 * Copy the PMD.  That is, the kernelmode and usermode
+		 * tables will share the last-level page tables of this
+		 * address range
+		 */
+		*target_pmd = pmd_clear_flags(*pmd, clear);
+	}
+}
+
 /*
  * Initialize kernel page table isolation
  */

From 8d4b067895791ab9fdb1aadfc505f64d71239dd2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:43 +0100
Subject: [PATCH 120/305] x86/mm/pti: Force entry through trampoline when PTI
 active

Force the entry through the trampoline only when PTI is active. Otherwise
go through the normal entry code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a9210f9b7cf8..f2a94dfb434e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1339,7 +1339,10 @@ void syscall_init(void)
 		(entry_SYSCALL_64_trampoline - _entry_trampoline);
 
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+	if (static_cpu_has(X86_FEATURE_PTI))
+		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+	else
+		wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);

From f7cfbee91559ca7e3e961a00ffac921208a115ad Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:45 +0100
Subject: [PATCH 121/305] x86/mm/pti: Share cpu_entry_area with user space page
 tables

Share the cpu entry area so the user space and kernel space page tables
have the same P4D page.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pti.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index d58bcee470fc..59290356f19f 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -264,6 +264,29 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
 	}
 }
 
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems).
+ */
+static void __init pti_clone_p4d(unsigned long addr)
+{
+	p4d_t *kernel_p4d, *user_p4d;
+	pgd_t *kernel_pgd;
+
+	user_p4d = pti_user_pagetable_walk_p4d(addr);
+	kernel_pgd = pgd_offset_k(addr);
+	kernel_p4d = p4d_offset(kernel_pgd, addr);
+	*user_p4d = *kernel_p4d;
+}
+
+/*
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ */
+static void __init pti_clone_user_shared(void)
+{
+	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+}
+
 /*
  * Initialize kernel page table isolation
  */
@@ -273,4 +296,6 @@ void __init pti_init(void)
 		return;
 
 	pr_info("enabled\n");
+
+	pti_clone_user_shared();
 }

From 2f7412ba9c6af5ab16bdbb4a3fdb1dcd2b4fd3c2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:46 +0100
Subject: [PATCH 122/305] x86/entry: Align entry text section to PMD boundary

The (irq)entry text must be visible in the user space page tables. To allow
simple PMD based sharing, make the entry text PMD aligned.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/vmlinux.lds.S | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d2a8b5a24a44..1e413a9326aa 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
 		. = ALIGN(HPAGE_SIZE);				\
 		__end_rodata_hpage_align = .;
 
+#define ALIGN_ENTRY_TEXT_BEGIN	. = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END	. = ALIGN(PMD_SIZE);
+
 #else
 
 #define X64_ALIGN_RODATA_BEGIN
 #define X64_ALIGN_RODATA_END
 
+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+
 #endif
 
 PHDRS {
@@ -102,8 +108,10 @@ SECTIONS
 		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ALIGN_ENTRY_TEXT_BEGIN
 		ENTRY_TEXT
 		IRQENTRY_TEXT
+		ALIGN_ENTRY_TEXT_END
 		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)

From 6dc72c3cbca0580642808d677181cad4c6433893 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:47 +0100
Subject: [PATCH 123/305] x86/mm/pti: Share entry text PMD

Share the entry text PMD of the kernel mapping with the user space
mapping. If large pages are enabled this is a single PMD entry and at the
point where it is copied into the user page table the RW bit has not been
cleared yet. Clear it right away so the user space visible map becomes RX.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pti.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 59290356f19f..0e78797650a7 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -287,6 +287,15 @@ static void __init pti_clone_user_shared(void)
 	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
 }
 
+/*
+ * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ */
+static void __init pti_clone_entry_text(void)
+{
+	pti_clone_pmds((unsigned long) __entry_text_start,
+			(unsigned long) __irqentry_text_end, _PAGE_RW);
+}
+
 /*
  * Initialize kernel page table isolation
  */
@@ -298,4 +307,5 @@ void __init pti_init(void)
 	pr_info("enabled\n");
 
 	pti_clone_user_shared();
+	pti_clone_entry_text();
 }

From 4b6bbe95b87966ba08999574db65c93c5e925a36 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 15 Dec 2017 22:08:18 +0100
Subject: [PATCH 124/305] x86/mm/pti: Map ESPFIX into user space

Map the ESPFIX pages into user space when PTI is enabled.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pti.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 0e78797650a7..b1c38ef9fbbb 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -287,6 +287,16 @@ static void __init pti_clone_user_shared(void)
 	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
 }
 
+/*
+ * Clone the ESPFIX P4D into the user space visible page table
+ */
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+	pti_clone_p4d(ESPFIX_BASE_ADDR);
+#endif
+}
+
 /*
  * Clone the populated PMDs of the entry and irqentry text and force it RO.
  */
@@ -308,4 +318,5 @@ void __init pti_init(void)
 
 	pti_clone_user_shared();
 	pti_clone_entry_text();
+	pti_setup_espfix64();
 }

From 10043e02db7f8a4161f76434931051e7d797a5f6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:49 +0100
Subject: [PATCH 125/305] x86/cpu_entry_area: Add debugstore entries to
 cpu_entry_area

The Intel PEBS/BTS debug store is a design trainwreck as it expects virtual
addresses which must be visible in any execution context.

So it is required to make these mappings visible to user space when kernel
page table isolation is active.

Provide enough room for the buffer mappings in the cpu_entry_area so the
buffers are available in the user space visible page tables.

At the point where the kernel side entry area is populated there is no
buffer available yet, but the kernel PMD must be populated. To achieve this
set the entries for these buffers to non present.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/ds.c            |  5 ++--
 arch/x86/events/perf_event.h          | 21 ++--------------
 arch/x86/include/asm/cpu_entry_area.h | 13 ++++++++++
 arch/x86/include/asm/intel_ds.h       | 36 +++++++++++++++++++++++++++
 arch/x86/mm/cpu_entry_area.c          | 27 ++++++++++++++++++++
 5 files changed, 81 insertions(+), 21 deletions(-)
 create mode 100644 arch/x86/include/asm/intel_ds.h

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 3674a4b6f8bd..6522f0279cb8 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -8,11 +8,12 @@
 
 #include "../perf_event.h"
 
+/* Waste a full page so it can be mapped into the cpu_entry_area */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE		24
 
-#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
 #define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 /*
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f7aaadf9331f..373f9eda80b1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -14,6 +14,8 @@
 
 #include <linux/perf_event.h>
 
+#include <asm/intel_ds.h>
+
 /* To enable MSR tracing please use the generic trace points. */
 
 /*
@@ -77,8 +79,6 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS		8
 #define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)
 
 /*
@@ -95,23 +95,6 @@ struct amd_nb {
 	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
 	PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
 
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
 #define PEBS_REGS \
 	(PERF_REG_X86_AX | \
 	 PERF_REG_X86_BX | \
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 2fbc69a0916e..4a7884b8dca5 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -5,6 +5,7 @@
 
 #include <linux/percpu-defs.h>
 #include <asm/processor.h>
+#include <asm/intel_ds.h>
 
 /*
  * cpu_entry_area is a percpu region that contains things needed by the CPU
@@ -40,6 +41,18 @@ struct cpu_entry_area {
 	 */
 	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
 #endif
+#ifdef CONFIG_CPU_SUP_INTEL
+	/*
+	 * Per CPU debug store for Intel performance monitoring. Wastes a
+	 * full page at the moment.
+	 */
+	struct debug_store cpu_debug_store;
+	/*
+	 * The actual PEBS/BTS buffers must be mapped to user space
+	 * Reserve enough fixmap PTEs.
+	 */
+	struct debug_store_buffers cpu_debug_buffers;
+#endif
 };
 
 #define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area))
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
new file mode 100644
index 000000000000..62a9f4966b42
--- /dev/null
+++ b/arch/x86/include/asm/intel_ds.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_INTEL_DS_H
+#define _ASM_INTEL_DS_H
+
+#include <linux/percpu-defs.h>
+
+#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS		8
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+struct debug_store_buffers {
+	char	bts_buffer[BTS_BUFFER_SIZE];
+	char	pebs_buffer[PEBS_BUFFER_SIZE];
+};
+
+#endif
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index fe814fd5e014..b9283cc27622 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
 		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
 }
 
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+	int npages;
+	void *cea;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return;
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+	npages = sizeof(struct debug_store) / PAGE_SIZE;
+	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+	cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+			     PAGE_KERNEL);
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+	/*
+	 * Force the population of PMDs for not yet allocated per cpu
+	 * memory like debug store buffers.
+	 */
+	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+	for (; npages; npages--, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+#endif
+}
+
 /* Setup the fixmap mappings only once per-processor */
 static void __init setup_cpu_entry_area(int cpu)
 {
@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
 	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
 		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 #endif
+	percpu_setup_debug_store(cpu);
 }
 
 static __init void setup_cpu_entry_area_ptes(void)

From c1961a4631daef4aeabee8e368b1b13e8f173c91 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 4 Dec 2017 15:07:50 +0100
Subject: [PATCH 126/305] x86/events/intel/ds: Map debug buffers in
 cpu_entry_area

The BTS and PEBS buffers both have their virtual addresses programmed into
the hardware.  This means that any access to them is performed via the page
tables.  The times that the hardware accesses these are entirely dependent
on how the performance monitoring hardware events are set up.  In other
words, there is no way for the kernel to tell when the hardware might
access these buffers.

To avoid perf crashes, place 'debug_store' in the cpu_entry_area, allocate
pages for the BTS and PEBS buffers, and map them into the cpu_entry_area too.

The PEBS fixup buffer does not need this treatment.

[ tglx: Got rid of the kaiser_add_mapping() complication ]

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/ds.c   | 125 ++++++++++++++++++++++-------------
 arch/x86/events/perf_event.h |   2 +
 2 files changed, 82 insertions(+), 45 deletions(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 6522f0279cb8..8f0aace08b87 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,6 +3,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 
@@ -280,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
+{
+	phys_addr_t pa;
+	size_t msz = 0;
+
+	pa = virt_to_phys(addr);
+	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+	size_t msz = 0;
+
+	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+}
+
+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
+{
+	unsigned int order = get_order(size);
+	int node = cpu_to_node(cpu);
+	struct page *page;
+
+	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+	return page ? page_address(page) : NULL;
+}
+
+static void dsfree_pages(const void *buffer, size_t size)
+{
+	if (buffer)
+		free_pages((unsigned long)buffer, get_order(size));
+}
+
 static int alloc_pebs_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-	int node = cpu_to_node(cpu);
-	int max;
-	void *buffer, *ibuffer;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	size_t bsiz = x86_pmu.pebs_buffer_size;
+	int max, node = cpu_to_node(cpu);
+	void *buffer, *ibuffer, *cea;
 
 	if (!x86_pmu.pebs)
 		return 0;
 
-	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
@@ -301,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		if (!ibuffer) {
-			kfree(buffer);
+			dsfree_pages(buffer, bsiz);
 			return -ENOMEM;
 		}
 		per_cpu(insn_buffer, cpu) = ibuffer;
 	}
-
-	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
-
-	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_pebs_vaddr = buffer;
+	/* Update the cpu entry area mapping */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds->pebs_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
 	ds->pebs_index = ds->pebs_buffer_base;
-	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
-		max * x86_pmu.pebs_record_size;
-
+	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
+	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
 	return 0;
 }
 
 static void release_pebs_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 	if (!ds || !x86_pmu.pebs)
 		return;
@@ -327,73 +365,70 @@ static void release_pebs_buffer(int cpu)
 	kfree(per_cpu(insn_buffer, cpu));
 	per_cpu(insn_buffer, cpu) = NULL;
 
-	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
 	ds->pebs_buffer_base = 0;
+	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
+	hwev->ds_pebs_vaddr = NULL;
 }
 
 static int alloc_bts_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-	int node = cpu_to_node(cpu);
-	int max, thresh;
-	void *buffer;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *buffer, *cea;
+	int max;
 
 	if (!x86_pmu.bts)
 		return 0;
 
-	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
 	if (unlikely(!buffer)) {
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
 	}
-
-	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
-	thresh = max / 16;
-
-	ds->bts_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_bts_vaddr = buffer;
+	/* Update the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds->bts_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
 	ds->bts_index = ds->bts_buffer_base;
-	ds->bts_absolute_maximum = ds->bts_buffer_base +
-		max * BTS_RECORD_SIZE;
-	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
-		thresh * BTS_RECORD_SIZE;
-
+	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
+	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
+	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
 	return 0;
 }
 
 static void release_bts_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 	if (!ds || !x86_pmu.bts)
 		return;
 
-	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds_clear_cea(cea, BTS_BUFFER_SIZE);
 	ds->bts_buffer_base = 0;
+	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
+	hwev->ds_bts_vaddr = NULL;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-	int node = cpu_to_node(cpu);
-	struct debug_store *ds;
-
-	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-	if (unlikely(!ds))
-		return -ENOMEM;
+	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
 
+	memset(ds, 0, sizeof(*ds));
 	per_cpu(cpu_hw_events, cpu).ds = ds;
-
 	return 0;
 }
 
 static void release_ds_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-	if (!ds)
-		return;
-
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
-	kfree(ds);
 }
 
 void release_ds_buffers(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 373f9eda80b1..8e4ea143ed96 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -199,6 +199,8 @@ struct cpu_hw_events {
 	 * Intel DebugStore bits
 	 */
 	struct debug_store	*ds;
+	void			*ds_pebs_vaddr;
+	void			*ds_bts_vaddr;
 	u64			pebs_enabled;
 	int			n_pebs;
 	int			n_large_pebs;

From 9f449772a3106bcdd4eb8fdeb281147b0e99fb30 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Dec 2017 07:56:44 -0800
Subject: [PATCH 127/305] x86/mm/64: Make a full PGD-entry size hole in the
 memory map

Shrink vmalloc space from 16384TiB to 12800TiB to enlarge the hole starting
at 0xff90000000000000 to be a full PGD entry.

A subsequent patch will use this hole for the pagetable isolation LDT
alias.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/x86_64/mm.txt         | 4 ++--
 arch/x86/include/asm/pgtable_64_types.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 51101708a03a..496a1dbf139d 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -29,8 +29,8 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ff90000000000000 - ff9fffffffffffff (=52 bits) hole
+ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 3d27831bc58d..83e9489ae944 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -79,8 +79,8 @@ typedef struct { pteval_t pte; } pte_t;
 #define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 
 #ifdef CONFIG_X86_5LEVEL
-# define VMALLOC_SIZE_TB	_AC(16384, UL)
-# define __VMALLOC_BASE		_AC(0xff92000000000000, UL)
+# define VMALLOC_SIZE_TB	_AC(12800, UL)
+# define __VMALLOC_BASE		_AC(0xffa0000000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)
 #else
 # define VMALLOC_SIZE_TB	_AC(32, UL)

From f55f0501cbf65ec41cca5058513031b711730b1d Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Dec 2017 07:56:45 -0800
Subject: [PATCH 128/305] x86/pti: Put the LDT in its own PGD if PTI is on

With PTI enabled, the LDT must be mapped in the usermode tables somewhere.
The LDT is per process, i.e. per mm.

An earlier approach mapped the LDT on context switch into a fixmap area,
but that's a big overhead and exhausted the fixmap space when NR_CPUS got
big.

Take advantage of the fact that there is an address space hole which
provides a completely unused pgd. Use this pgd to manage per-mm LDT
mappings.

This has a down side: the LDT isn't (currently) randomized, and an attack
that can write the LDT is instant root due to call gates (thanks, AMD, for
leaving call gates in AMD64 but designing them wrong so they're only useful
for exploits).  This can be mitigated by making the LDT read-only or
randomizing the mapping, either of which is straightforward on top of this
patch.

This will significantly slow down LDT users, but that shouldn't matter for
important workloads -- the LDT is only used by DOSEMU(2), Wine, and very
old libc implementations.

[ tglx: Cleaned it up. ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/x86_64/mm.txt         |   3 +-
 arch/x86/include/asm/mmu_context.h      |  59 +++++++++-
 arch/x86/include/asm/pgtable_64_types.h |   4 +
 arch/x86/include/asm/processor.h        |  23 ++--
 arch/x86/kernel/ldt.c                   | 139 +++++++++++++++++++++++-
 arch/x86/mm/dump_pagetables.c           |   9 ++
 6 files changed, 220 insertions(+), 17 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 496a1dbf139d..ad41b3813f0a 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
 fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff9fffffffffffff (=52 bits) hole
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
 ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5ede7cae1d67..c931b88982a0 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -50,10 +50,33 @@ struct ldt_struct {
 	 * call gates.  On native, we could merge the ldt_struct and LDT
 	 * allocations, but it's not worth trying to optimize.
 	 */
-	struct desc_struct *entries;
-	unsigned int nr_entries;
+	struct desc_struct	*entries;
+	unsigned int		nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode.  The whole array will be aliased at the addressed
+	 * given by ldt_slot_va(slot).  We use two slots so that we can allocate
+	 * and map, and enable a new LDT without invalidating the mapping
+	 * of an older, still-in-use LDT.
+	 *
+	 * slot will be -1 if this LDT doesn't have an alias mapping.
+	 */
+	int			slot;
 };
 
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+	BUG();
+#endif
+}
+
 /*
  * Used for LDT copy/destruction.
  */
@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
 }
 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 static inline void init_new_context_ldt(struct mm_struct *mm) { }
 static inline int ldt_dup_context(struct mm_struct *oldmm,
@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
 {
 	return 0;
 }
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
 #endif
 
 static inline void load_mm_ldt(struct mm_struct *mm)
@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 	 * that we can see.
 	 */
 
-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				/*
+				 * Whoops -- either the new LDT isn't mapped
+				 * (if slot == -1) or is mapped into a bogus
+				 * slot (if slot > 1).
+				 */
+				clear_LDT();
+				return;
+			}
+
+			/*
+			 * If page table isolation is enabled, ldt->entries
+			 * will not be mapped in the userspace pagetables.
+			 * Tell the CPU to access the LDT through the alias
+			 * at ldt_slot_va(ldt->slot).
+			 */
+			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+		} else {
+			set_ldt(ldt->entries, ldt->nr_entries);
+		}
+	} else {
 		clear_LDT();
+	}
 #else
 	clear_LDT();
 #endif
@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 	paravirt_arch_exit_mmap(mm);
+	ldt_arch_exit_mmap(mm);
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 83e9489ae944..b97a539bcdee 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -82,10 +82,14 @@ typedef struct { pteval_t pte; } pte_t;
 # define VMALLOC_SIZE_TB	_AC(12800, UL)
 # define __VMALLOC_BASE		_AC(0xffa0000000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)
+# define LDT_PGD_ENTRY		_AC(-112, UL)
+# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #else
 # define VMALLOC_SIZE_TB	_AC(32, UL)
 # define __VMALLOC_BASE		_AC(0xffffc90000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)
+# define LDT_PGD_ENTRY		_AC(-4, UL)
+# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 
 #ifdef CONFIG_RANDOMIZE_MEMORY
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9e482d8b0b97..9c18da64daa9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
 
 #else
 /*
- * User space process size. 47bits minus one guard page.  The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously.  We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
  */
 #define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
 
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a6b5d62f45a7..9629c5d8267a 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -24,6 +24,7 @@
 #include <linux/uaccess.h>
 
 #include <asm/ldt.h>
+#include <asm/tlb.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
 static void flush_ldt(void *__mm)
 {
 	struct mm_struct *mm = __mm;
-	mm_context_t *pc;
 
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
 		return;
 
-	pc = &mm->context;
-	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+	load_mm_ldt(mm);
 
 	refresh_ldt_segments();
 }
@@ -94,10 +93,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
 		return NULL;
 	}
 
+	/* The new LDT isn't aliased for PTI yet. */
+	new_ldt->slot = -1;
+
 	new_ldt->nr_entries = num_entries;
 	return new_ldt;
 }
 
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function.  Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	bool is_vmalloc, had_top_level_entry;
+	unsigned long va;
+	spinlock_t *ptl;
+	pgd_t *pgd;
+	int i;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return 0;
+
+	/*
+	 * Any given ldt_struct should have map_ldt_struct() called at most
+	 * once.
+	 */
+	WARN_ON(ldt->slot != -1);
+
+	/*
+	 * Did we already have the top level entry allocated?  We can't
+	 * use pgd_none() for this because it doens't do anything on
+	 * 4-level page table kernels.
+	 */
+	pgd = pgd_offset(mm, LDT_BASE_ADDR);
+	had_top_level_entry = (pgd->pgd != 0);
+
+	is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+	for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+		unsigned long offset = i << PAGE_SHIFT;
+		const void *src = (char *)ldt->entries + offset;
+		unsigned long pfn;
+		pte_t pte, *ptep;
+
+		va = (unsigned long)ldt_slot_va(slot) + offset;
+		pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+			page_to_pfn(virt_to_page(src));
+		/*
+		 * Treat the PTI LDT range as a *userspace* range.
+		 * get_locked_pte() will allocate all needed pagetables
+		 * and account for them in this mm.
+		 */
+		ptep = get_locked_pte(mm, va, &ptl);
+		if (!ptep)
+			return -ENOMEM;
+		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
+		set_pte_at(mm, va, ptep, pte);
+		pte_unmap_unlock(ptep, ptl);
+	}
+
+	if (mm->context.ldt) {
+		/*
+		 * We already had an LDT.  The top-level entry should already
+		 * have been allocated and synchronized with the usermode
+		 * tables.
+		 */
+		WARN_ON(!had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI))
+			WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+	} else {
+		/*
+		 * This is the first time we're mapping an LDT for this process.
+		 * Sync the pgd to the usermode tables.
+		 */
+		WARN_ON(had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+			set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+		}
+	}
+
+	va = (unsigned long)ldt_slot_va(slot);
+	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+
+	ldt->slot = slot;
+#endif
+	return 0;
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	struct mmu_gather tlb;
+	unsigned long start = LDT_BASE_ADDR;
+	unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	tlb_gather_mmu(&tlb, mm, start, end);
+	free_pgd_range(&tlb, start, end, start, end);
+	tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
@@ -156,6 +266,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
 	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
 	finalize_ldt_struct(new_ldt);
 
+	retval = map_ldt_struct(mm, new_ldt, 0);
+	if (retval) {
+		free_ldt_pgtables(mm);
+		free_ldt_struct(new_ldt);
+		goto out_unlock;
+	}
 	mm->context.ldt = new_ldt;
 
 out_unlock:
@@ -174,6 +290,11 @@ void destroy_context_ldt(struct mm_struct *mm)
 	mm->context.ldt = NULL;
 }
 
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+	free_ldt_pgtables(mm);
+}
+
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
 	struct mm_struct *mm = current->mm;
@@ -287,6 +408,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	new_ldt->entries[ldt_info.entry_number] = ldt;
 	finalize_ldt_struct(new_ldt);
 
+	/*
+	 * If we are using PTI, map the new LDT into the userspace pagetables.
+	 * If there is already an LDT, use the other slot so that other CPUs
+	 * will continue to use the old LDT until install_ldt() switches
+	 * them over to the new LDT.
+	 */
+	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+	if (error) {
+		free_ldt_struct(old_ldt);
+		goto out_unlock;
+	}
+
 	install_ldt(mm, new_ldt);
 	free_ldt_struct(old_ldt);
 	error = 0;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 43dedbfb7257..690eaf31ca34 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -52,11 +52,17 @@ enum address_markers_idx {
 	USER_SPACE_NR = 0,
 	KERNEL_SPACE_NR,
 	LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
+#endif
 	VMALLOC_START_NR,
 	VMEMMAP_START_NR,
 #ifdef CONFIG_KASAN
 	KASAN_SHADOW_START_NR,
 	KASAN_SHADOW_END_NR,
+#endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
 #endif
 	CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
 #ifdef CONFIG_KASAN
 	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" },
 	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+	[LDT_NR]		= { LDT_BASE_ADDR,	"LDT remap" },
 #endif
 	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
 #ifdef CONFIG_X86_ESPFIX64

From 85900ea51577e31b186e523c8f4e068c79ecc7d3 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Dec 2017 07:56:42 -0800
Subject: [PATCH 129/305] x86/pti: Map the vsyscall page if needed

Make VSYSCALLs work fully in PTI mode by mapping them properly to the user
space visible page tables.

[ tglx: Hide unused functions (Patch by Arnd Bergmann) ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/vsyscall/vsyscall_64.c |  6 +--
 arch/x86/include/asm/vsyscall.h       |  1 +
 arch/x86/mm/pti.c                     | 65 +++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 1faf40f2dda9..577fa8adb785 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
  * vsyscalls but leave the page not present.  If so, we skip calling
  * this.
  */
-static void __init set_vsyscall_pgtable_user_bits(void)
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
-	pgd = pgd_offset_k(VSYSCALL_ADDR);
+	pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
 	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
 	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
 #if CONFIG_PGTABLE_LEVELS >= 5
@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
 			     vsyscall_mode == NATIVE
 			     ? PAGE_KERNEL_VSYSCALL
 			     : PAGE_KERNEL_VVAR);
-		set_vsyscall_pgtable_user_bits();
+		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
 	}
 
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d9a7c659009c..b986b2ca688a 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -7,6 +7,7 @@
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
 extern void map_vsyscall(void);
+extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
 
 /*
  * Called on instruction fetch fault in vsyscall page.
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index b1c38ef9fbbb..bce8aea65606 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -38,6 +38,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/hypervisor.h>
+#include <asm/vsyscall.h>
 #include <asm/cmdline.h>
 #include <asm/pti.h>
 #include <asm/pgtable.h>
@@ -223,6 +224,69 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 	return pmd_offset(pud, address);
 }
 
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+/*
+ * Walk the shadow copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.  Does not support large pages.
+ *
+ * Note: this is only used when mapping *new* kernel data into the
+ * user/shadow page tables.  It is never used for userspace data.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+{
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+	pte_t *pte;
+
+	/* We can't do anything sensible if we hit a large mapping. */
+	if (pmd_large(*pmd)) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	if (pmd_none(*pmd)) {
+		unsigned long new_pte_page = __get_free_page(gfp);
+		if (!new_pte_page)
+			return NULL;
+
+		if (pmd_none(*pmd)) {
+			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+			new_pte_page = 0;
+		}
+		if (new_pte_page)
+			free_page(new_pte_page);
+	}
+
+	pte = pte_offset_kernel(pmd, address);
+	if (pte_flags(*pte) & _PAGE_USER) {
+		WARN_ONCE(1, "attempt to walk to user pte\n");
+		return NULL;
+	}
+	return pte;
+}
+
+static void __init pti_setup_vsyscall(void)
+{
+	pte_t *pte, *target_pte;
+	unsigned int level;
+
+	pte = lookup_address(VSYSCALL_ADDR, &level);
+	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
+		return;
+
+	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+	if (WARN_ON(!target_pte))
+		return;
+
+	*target_pte = *pte;
+	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
+}
+#else
+static void __init pti_setup_vsyscall(void) { }
+#endif
+
 static void __init
 pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
 {
@@ -319,4 +383,5 @@ void __init pti_init(void)
 	pti_clone_user_shared();
 	pti_clone_entry_text();
 	pti_setup_espfix64();
+	pti_setup_vsyscall();
 }

From 2ea907c4fe7b78e5840c1dc07800eae93248cad1 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:57 +0100
Subject: [PATCH 130/305] x86/mm: Allow flushing for future ASID switches

If changing the page tables in such a way that an invalidation of all
contexts (aka. PCIDs / ASIDs) is required, they can be actively invalidated
by:

 1. INVPCID for each PCID (works for single pages too).

 2. Load CR3 with each PCID without the NOFLUSH bit set

 3. Load CR3 with the NOFLUSH bit set for each and do INVLPG for each address.

But none of these are really feasible since there are ~6 ASIDs (12 with
PAGE_TABLE_ISOLATION) at the time that invalidation is required.
Instead of actively invalidating them, invalidate the *current* context and
also mark the cpu_tlbstate _quickly_ to indicate that future invalidation
is required.

At the next context-switch, look for this indicator
('invalidate_other' being set) and invalidate all of the
cpu_tlbstate.ctxs[] entries.

This ensures that any future context switches will do a full flush
of the TLB, picking up the previous changes.

[ tglx: Folded more fixups from Peter ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 37 ++++++++++++++++++++++++++-------
 arch/x86/mm/tlb.c               | 35 +++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 171b429f43a2..490a706fdba8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -134,6 +134,17 @@ struct tlb_state {
 	 */
 	bool is_lazy;
 
+	/*
+	 * If set we changed the page tables in such a way that we
+	 * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+	 * This tells us to go invalidate all the non-loaded ctxs[]
+	 * on the next context switch.
+	 *
+	 * The current ctx was kept up-to-date as it ran and does not
+	 * need to be invalidated.
+	 */
+	bool invalidate_other;
+
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
 	 * disabling interrupts when modifying either one.
@@ -211,6 +222,14 @@ static inline unsigned long cr4_read_shadow(void)
 	return this_cpu_read(cpu_tlbstate.cr4);
 }
 
+/*
+ * Mark all other ASIDs as invalid, preserves the current.
+ */
+static inline void invalidate_other_asid(void)
+{
+	this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
 /*
  * Save some of cr4 feature set we're using (e.g.  Pentium 4MB
  * enable and PPro Global page enable), so that any CPU's that boot
@@ -298,14 +317,6 @@ static inline void __flush_tlb_all(void)
 		 */
 		__flush_tlb();
 	}
-
-	/*
-	 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
-	 * we'd end up flushing kernel translations for the current ASID but
-	 * we might fail to flush kernel translations for other cached ASIDs.
-	 *
-	 * To avoid this issue, we force PCID off if PGE is off.
-	 */
 }
 
 /*
@@ -315,6 +326,16 @@ static inline void __flush_tlb_one(unsigned long addr)
 {
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
 	__flush_tlb_single(addr);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * __flush_tlb_single() will have cleared the TLB entry for this ASID,
+	 * but since kernel space is replicated across all, we must also
+	 * invalidate all others.
+	 */
+	invalidate_other_asid();
 }
 
 #define TLB_FLUSH_ALL	-1UL
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0a1be3adc97e..254c9eb79fe5 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,38 @@
  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts.  We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+void clear_asid_other(void)
+{
+	u16 asid;
+
+	/*
+	 * This is only expected to be set if we have disabled
+	 * kernel _PAGE_GLOBAL pages.
+	 */
+	if (!static_cpu_has(X86_FEATURE_PTI)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		/* Do not need to flush the current asid */
+		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+			continue;
+		/*
+		 * Make sure the next time we go to switch to
+		 * this asid, we do a flush:
+		 */
+		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+	}
+	this_cpu_write(cpu_tlbstate.invalidate_other, false);
+}
+
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
 
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 		return;
 	}
 
+	if (this_cpu_read(cpu_tlbstate.invalidate_other))
+		clear_asid_other();
+
 	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
 		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
 		    next->context.ctx_id)

From 48e111982cda033fec832c6b0592c2acedd85d04 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:58 +0100
Subject: [PATCH 131/305] x86/mm: Abstract switching CR3

In preparation to adding additional PCID flushing, abstract the
loading of a new ASID into CR3.

[ PeterZ: Split out from big combo patch ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/tlb.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 254c9eb79fe5..42a8875f73fe 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -100,6 +100,24 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 	*need_flush = true;
 }
 
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+	unsigned long new_mm_cr3;
+
+	if (need_flush) {
+		new_mm_cr3 = build_cr3(pgdir, new_asid);
+	} else {
+		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+	}
+
+	/*
+	 * Caution: many callers of this function expect
+	 * that load_cr3() is serializing and orders TLB
+	 * fills with respect to the mm_cpumask writes.
+	 */
+	write_cr3(new_mm_cr3);
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -230,7 +248,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			write_cr3(build_cr3(next->pgd, new_asid));
+			load_new_mm_cr3(next->pgd, new_asid, true);
 
 			/*
 			 * NB: This gets called via leave_mm() in the idle path
@@ -243,7 +261,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 		} else {
 			/* The new ASID is already up to date. */
-			write_cr3(build_cr3_noflush(next->pgd, new_asid));
+			load_new_mm_cr3(next->pgd, new_asid, false);
 
 			/* See above wrt _rcuidle. */
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);

From 6fd166aae78c0ab738d49bda653cbd9e3b1491cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 4 Dec 2017 15:07:59 +0100
Subject: [PATCH 132/305] x86/mm: Use/Fix PCID to optimize user/kernel switches

We can use PCID to retain the TLBs across CR3 switches; including those now
part of the user/kernel switch. This increases performance of kernel
entry/exit at the cost of more expensive/complicated TLB flushing.

Now that we have two address spaces, one for kernel and one for user space,
we need two PCIDs per mm. We use the top PCID bit to indicate a user PCID
(just like we use the PFN LSB for the PGD). Since we do TLB invalidation
from kernel space, the existing code will only invalidate the kernel PCID.
We augment that by marking the corresponding user PCID invalid and, upon
switching back to userspace, using a flushing CR3 write for the switch.

In order to access the user_pcid_flush_mask we use PER_CPU storage, which
means the previously established SWAPGS vs CR3 ordering is now mandatory
and required.

Having to do this memory access does require an additional register. Most
sites have a functioning stack and can spill one (RAX); sites without a
functional stack must otherwise provide the second scratch register.

Note: PCID is generally available on Intel Sandybridge and later CPUs.
Note: Up until this point TLB flushing was broken in this series.

Based-on-code-from: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h                    | 72 +++++++++++++---
 arch/x86/entry/entry_64.S                   |  9 +-
 arch/x86/entry/entry_64_compat.S            |  4 +-
 arch/x86/include/asm/processor-flags.h      |  5 ++
 arch/x86/include/asm/tlbflush.h             | 91 ++++++++++++++++++---
 arch/x86/include/uapi/asm/processor-flags.h |  7 +-
 arch/x86/kernel/asm-offsets.c               |  4 +
 arch/x86/mm/init.c                          |  2 +-
 arch/x86/mm/tlb.c                           |  1 +
 9 files changed, 162 insertions(+), 33 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3d3389a92c33..7894e5c0eef7 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -3,6 +3,9 @@
 #include <asm/unwind_hints.h>
 #include <asm/cpufeatures.h>
 #include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
 
 /*
 
@@ -191,17 +194,21 @@ For 32-bit we have the following conventions - kernel is built with
 
 #ifdef CONFIG_PAGE_TABLE_ISOLATION
 
-/* PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two halves: */
-#define PTI_SWITCH_MASK (1<<PAGE_SHIFT)
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
 
-.macro ADJUST_KERNEL_CR3 reg:req
-	/* Clear "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
-	andq	$(~PTI_SWITCH_MASK), \reg
+.macro SET_NOFLUSH_BIT	reg:req
+	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
 .endm
 
-.macro ADJUST_USER_CR3 reg:req
-	/* Move CR3 up a page to the user page tables: */
-	orq	$(PTI_SWITCH_MASK), \reg
+.macro ADJUST_KERNEL_CR3 reg:req
+	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+	andq    $(~PTI_SWITCH_MASK), \reg
 .endm
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
@@ -212,21 +219,58 @@ For 32-bit we have the following conventions - kernel is built with
 .Lend_\@:
 .endm
 
-.macro SWITCH_TO_USER_CR3 scratch_reg:req
+#define THIS_CPU_user_pcid_flush_mask   \
+	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
 	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 	mov	%cr3, \scratch_reg
-	ADJUST_USER_CR3 \scratch_reg
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * Test if the ASID needs a flush.
+	 */
+	movq	\scratch_reg, \scratch_reg2
+	andq	$(0x7FF), \scratch_reg		/* mask ASID */
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	/* Flush needed, clear the bit */
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	movq	\scratch_reg2, \scratch_reg
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	movq	\scratch_reg2, \scratch_reg
+	SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_\@:
+	/* Flip the PGD and ASID to the user version */
+	orq     $(PTI_SWITCH_MASK), \scratch_reg
 	mov	\scratch_reg, %cr3
 .Lend_\@:
 .endm
 
+.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
+	pushq	%rax
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+	popq	%rax
+.endm
+
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
 	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
 	movq	%cr3, \scratch_reg
 	movq	\scratch_reg, \save_reg
 	/*
-	 * Is the switch bit zero?  This means the address is
-	 * up in real PAGE_TABLE_ISOLATION patches in a moment.
+	 * Is the "switch mask" all zero?  That means that both of
+	 * these are zero:
+	 *
+	 *	1. The user/kernel PCID bit, and
+	 *	2. The user/kernel "bit" that points CR3 to the
+	 *	   bottom half of the 8k PGD
+	 *
+	 * That indicates a kernel CR3 value, not a user CR3.
 	 */
 	testq	$(PTI_SWITCH_MASK), \scratch_reg
 	jz	.Ldone_\@
@@ -251,7 +295,9 @@ For 32-bit we have the following conventions - kernel is built with
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
 .endm
-.macro SWITCH_TO_USER_CR3 scratch_reg:req
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
 .endm
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
 .endm
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 2ad7ad4d3dd6..fd501844af1f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -23,7 +23,6 @@
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
-#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
@@ -40,6 +39,8 @@
 #include <asm/frame.h>
 #include <linux/err.h>
 
+#include "calling.h"
+
 .code64
 .section .entry.text, "ax"
 
@@ -406,7 +407,7 @@ syscall_return_via_sysret:
 	 * We are on the trampoline stack.  All regs except RDI are live.
 	 * We can do future final exit work right here.
 	 */
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 
 	popq	%rdi
 	popq	%rsp
@@ -744,7 +745,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	 * We can do future final exit work right here.
 	 */
 
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 
 	/* Restore RDI. */
 	popq	%rdi
@@ -857,7 +858,7 @@ native_irq_return_ldt:
 	 */
 	orq	PER_CPU_VAR(espfix_stack), %rax
 
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi	/* to user CR3 */
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 	SWAPGS					/* to user GS */
 	popq	%rdi				/* Restore user RDI */
 
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 05238b29895e..40f17009ec20 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -275,9 +275,9 @@ sysret32_from_system_call:
 	 * switch until after after the last reference to the process
 	 * stack.
 	 *
-	 * %r8 is zeroed before the sysret, thus safe to clobber.
+	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
 	 */
-	SWITCH_TO_USER_CR3 scratch_reg=%r8
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
 
 	xorq	%r8, %r8
 	xorq	%r9, %r9
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 43212a43ee69..6a60fea90b9d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -38,6 +38,11 @@
 #define CR3_ADDR_MASK	__sme_clr(0x7FFFFFFFFFFFF000ull)
 #define CR3_PCID_MASK	0xFFFull
 #define CR3_NOFLUSH	BIT_ULL(63)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_SWITCH_BIT	11
+#endif
+
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 490a706fdba8..5dcc38b16604 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -10,6 +10,8 @@
 #include <asm/special_insns.h>
 #include <asm/smp.h>
 #include <asm/invpcid.h>
+#include <asm/pti.h>
+#include <asm/processor-flags.h>
 
 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
@@ -24,24 +26,54 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 
 /* There are 12 bits of space for ASIDS in CR3 */
 #define CR3_HW_ASID_BITS		12
+
 /*
  * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
  * user/kernel switches
  */
-#define PTI_CONSUMED_ASID_BITS		0
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS	1
+#else
+# define PTI_CONSUMED_PCID_BITS	0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
 
-#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
 /*
  * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
  * for them being zero-based.  Another -1 is because ASID 0 is reserved for
  * use by non-PCID-aware users.
  */
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+ * lines.
+ */
+#define TLB_NR_DYN_ASIDS	6
 
 static inline u16 kern_pcid(u16 asid)
 {
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	/*
+	 * Make sure that the dynamic ASID space does not confict with the
+	 * bit we are using to switch between user and kernel ASIDs.
+	 */
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+
+	/*
+	 * The ASID being passed in here should have respected the
+	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+	 */
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+#endif
+	/*
+	 * The dynamically-assigned ASIDs that get passed in are small
+	 * (<TLB_NR_DYN_ASIDS).  They never have the high switch bit set,
+	 * so do not bother to clear it.
+	 *
 	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
 	 * PCID bits.  This serves two purposes.  It prevents a nasty
 	 * situation in which PCID-unaware code saves CR3, loads some other
@@ -95,12 +127,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
 	return !static_cpu_has(X86_FEATURE_PCID);
 }
 
-/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
- */
-#define TLB_NR_DYN_ASIDS 6
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -145,6 +171,13 @@ struct tlb_state {
 	 */
 	bool invalidate_other;
 
+	/*
+	 * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+	 * the corresponding user PCID needs a flush next time we
+	 * switch to it; see SWITCH_TO_USER_CR3.
+	 */
+	unsigned short user_pcid_flush_mask;
+
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
 	 * disabling interrupts when modifying either one.
@@ -249,15 +282,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
 
 extern void initialize_tlbstate_and_flush(void);
 
+/*
+ * Given an ASID, flush the corresponding user ASID.  We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+	/* There is no user ASID if address space separation is off */
+	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		return;
+
+	/*
+	 * We only have a single ASID if PCID is off and the CR3
+	 * write will have flushed it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PCID))
+		return;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	__set_bit(kern_pcid(asid),
+		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
 /*
  * flush the entire current user mapping
  */
 static inline void __native_flush_tlb(void)
 {
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 	/*
-	 * If current->mm == NULL then we borrow a mm which may change during a
-	 * task switch and therefore we must not be preempted while we write CR3
-	 * back:
+	 * If current->mm == NULL then we borrow a mm which may change
+	 * during a task switch and therefore we must not be preempted
+	 * while we write CR3 back:
 	 */
 	preempt_disable();
 	native_write_cr3(__native_read_cr3());
@@ -301,7 +361,14 @@ static inline void __native_flush_tlb_global(void)
  */
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
 	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	invalidate_user_asid(loaded_mm_asid);
 }
 
 /*
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 53b4ca55ebb6..97abdaab9535 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -78,7 +78,12 @@
 #define X86_CR3_PWT		_BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT		4 /* Page Cache Disable */
 #define X86_CR3_PCD		_BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK	_AC(0x00000fff,UL) /* PCID Mask */
+
+#define X86_CR3_PCID_BITS	12
+#define X86_CR3_PCID_MASK	(_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 676b7cf4b62b..76417a9aab73 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -17,6 +17,7 @@
 #include <asm/sigframe.h>
 #include <asm/bootparam.h>
 #include <asm/suspend.h>
+#include <asm/tlbflush.h>
 
 #ifdef CONFIG_XEN
 #include <xen/interface/xen.h>
@@ -94,6 +95,9 @@ void common(void) {
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
 
+	/* TLB state for the entry code */
+	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index af75069fb116..caeb8a7bf0a4 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -855,7 +855,7 @@ void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
 	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 42a8875f73fe..a1561957dccb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -105,6 +105,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
 	unsigned long new_mm_cr3;
 
 	if (need_flush) {
+		invalidate_user_asid(new_asid);
 		new_mm_cr3 = build_cr3(pgdir, new_asid);
 	} else {
 		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);

From 21e94459110252d41b45c0c8ba50fd72a664d50c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 4 Dec 2017 15:08:00 +0100
Subject: [PATCH 133/305] x86/mm: Optimize RESTORE_CR3

Most NMI/paranoid exceptions will not in fact change pagetables and would
thus not require TLB flushing; however, RESTORE_CR3 uses flushing CR3
writes.

Restores to kernel PCIDs can be NOFLUSH, because we explicitly flush the
kernel mappings and now that we track which user PCIDs need flushing we can
avoid those too when possible.

This does mean RESTORE_CR3 needs an additional scratch_reg; luckily, both
sites have plenty available.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h  | 30 ++++++++++++++++++++++++++++--
 arch/x86/entry/entry_64.S |  4 ++--
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 7894e5c0eef7..45a63e00a6af 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -281,8 +281,34 @@ For 32-bit we have the following conventions - kernel is built with
 .Ldone_\@:
 .endm
 
-.macro RESTORE_CR3 save_reg:req
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
 	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * KERNEL pages can always resume with NOFLUSH as we do
+	 * explicit flushes.
+	 */
+	bt	$X86_CR3_PTI_SWITCH_BIT, \save_reg
+	jnc	.Lnoflush_\@
+
+	/*
+	 * Check if there's a pending flush for the user ASID we're
+	 * about to set.
+	 */
+	movq	\save_reg, \scratch_reg
+	andq	$(0x7FF), \scratch_reg
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
 	/*
 	 * The CR3 write could be avoided when not changing its value,
 	 * but would require a CR3 read *and* a scratch register.
@@ -301,7 +327,7 @@ For 32-bit we have the following conventions - kernel is built with
 .endm
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
 .endm
-.macro RESTORE_CR3 save_reg:req
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
 .endm
 
 #endif
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index fd501844af1f..ed31d00dc5ee 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1288,7 +1288,7 @@ ENTRY(paranoid_exit)
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
-	RESTORE_CR3	save_reg=%r14
+	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 	SWAPGS_UNSAFE_STACK
 	jmp	.Lparanoid_exit_restore
 .Lparanoid_exit_no_swapgs:
@@ -1730,7 +1730,7 @@ end_repeat_nmi:
 	movq	$-1, %rsi
 	call	do_nmi
 
-	RESTORE_CR3 save_reg=%r14
+	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
 
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore

From 6cff64b86aaaa07f89f50498055a20e45754b0c1 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:08:01 +0100
Subject: [PATCH 134/305] x86/mm: Use INVPCID for __native_flush_tlb_single()

This uses INVPCID to shoot down individual lines of the user mapping
instead of marking the entire user map as invalid. This
could/might/possibly be faster.

This for sure needs tlb_single_page_flush_ceiling to be redetermined;
esp. since INVPCID is _slow_.

A detailed performance analysis is available here:

  https://lkml.kernel.org/r/3062e486-3539-8a1f-5724-16199420be71@intel.com

[ Peterz: Split out from big combo patch ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/tlbflush.h    | 23 ++++++++++-
 arch/x86/mm/init.c                 | 64 +++++++++++++++++-------------
 3 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index d8ec834ea884..07cdd1715705 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -197,6 +197,7 @@
 #define X86_FEATURE_CAT_L3		( 7*32+ 4) /* Cache Allocation Technology L3 */
 #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE	( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 5dcc38b16604..57072a1052fe 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -85,6 +85,18 @@ static inline u16 kern_pcid(u16 asid)
 	return asid + 1;
 }
 
+/*
+ * The user PCID is just the kernel one, plus the "switch bit".
+ */
+static inline u16 user_pcid(u16 asid)
+{
+	u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
+#endif
+	return ret;
+}
+
 struct pgd_t;
 static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 {
@@ -335,6 +347,8 @@ static inline void __native_flush_tlb_global(void)
 		/*
 		 * Using INVPCID is considerably faster than a pair of writes
 		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 *
+		 * Note, this works with CR4.PCIDE=0 or 1.
 		 */
 		invpcid_flush_all();
 		return;
@@ -368,7 +382,14 @@ static inline void __native_flush_tlb_single(unsigned long addr)
 	if (!static_cpu_has(X86_FEATURE_PTI))
 		return;
 
-	invalidate_user_asid(loaded_mm_asid);
+	/*
+	 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+	 * Just use invalidate_user_asid() in case we are called early.
+	 */
+	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+		invalidate_user_asid(loaded_mm_asid);
+	else
+		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
 }
 
 /*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index caeb8a7bf0a4..80259ad8c386 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -203,34 +203,44 @@ static void __init probe_page_size_mask(void)
 
 static void setup_pcid(void)
 {
-#ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_PCID)) {
-		if (boot_cpu_has(X86_FEATURE_PGE)) {
-			/*
-			 * This can't be cr4_set_bits_and_update_boot() --
-			 * the trampoline code can't handle CR4.PCIDE and
-			 * it wouldn't do any good anyway.  Despite the name,
-			 * cr4_set_bits_and_update_boot() doesn't actually
-			 * cause the bits in question to remain set all the
-			 * way through the secondary boot asm.
-			 *
-			 * Instead, we brute-force it and set CR4.PCIDE
-			 * manually in start_secondary().
-			 */
-			cr4_set_bits(X86_CR4_PCIDE);
-		} else {
-			/*
-			 * flush_tlb_all(), as currently implemented, won't
-			 * work if PCID is on but PGE is not.  Since that
-			 * combination doesn't exist on real hardware, there's
-			 * no reason to try to fully support it, but it's
-			 * polite to avoid corrupting data if we're on
-			 * an improperly configured VM.
-			 */
-			setup_clear_cpu_cap(X86_FEATURE_PCID);
-		}
+	if (!IS_ENABLED(CONFIG_X86_64))
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_PCID))
+		return;
+
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
+		/*
+		 * This can't be cr4_set_bits_and_update_boot() -- the
+		 * trampoline code can't handle CR4.PCIDE and it wouldn't
+		 * do any good anyway.  Despite the name,
+		 * cr4_set_bits_and_update_boot() doesn't actually cause
+		 * the bits in question to remain set all the way through
+		 * the secondary boot asm.
+		 *
+		 * Instead, we brute-force it and set CR4.PCIDE manually in
+		 * start_secondary().
+		 */
+		cr4_set_bits(X86_CR4_PCIDE);
+
+		/*
+		 * INVPCID's single-context modes (2/3) only work if we set
+		 * X86_CR4_PCIDE, *and* we have INVPCID support.  It's unusable
+		 * on systems that have X86_CR4_PCIDE clear, or that have
+		 * no INVPCID support at all.
+		 */
+		if (boot_cpu_has(X86_FEATURE_INVPCID))
+			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+	} else {
+		/*
+		 * flush_tlb_all(), as currently implemented, won't work if
+		 * PCID is on but PGE is not.  Since that combination
+		 * doesn't exist on real hardware, there's no reason to try
+		 * to fully support it, but it's polite to avoid corrupting
+		 * data if we're on an improperly configured VM.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_PCID);
 	}
-#endif
 }
 
 #ifdef CONFIG_X86_32

From 0a126abd576ebc6403f063dbe20cf7416c9d9393 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Dec 2017 13:34:53 +0100
Subject: [PATCH 135/305] x86/mm: Clarify the whole ASID/kernel PCID/user PCID
 naming

Ideally we'd also use sparse to enforce this separation so it becomes much
more difficult to mess up.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 55 ++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 57072a1052fe..b519da4fc03c 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -13,16 +13,33 @@
 #include <asm/pti.h>
 #include <asm/processor-flags.h>
 
-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
-{
-	/*
-	 * Bump the generation count.  This also serves as a full barrier
-	 * that synchronizes with switch_mm(): callers are required to order
-	 * their read of mm_cpumask after their writes to the paging
-	 * structures.
-	 */
-	return atomic64_inc_return(&mm->context.tlb_gen);
-}
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID  - [0, TLB_NR_DYN_ASIDS-1]
+ *         the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ *         the value we write into the PCID part of CR3; corresponds to the
+ *         ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ *         for KPTI each mm has two address spaces and thus needs two
+ *         PCID values, but we can still do with a single ASID denomination
+ *         for each mm. Corresponds to kPCID + 2048.
+ *
+ */
 
 /* There are 12 bits of space for ASIDS in CR3 */
 #define CR3_HW_ASID_BITS		12
@@ -41,7 +58,7 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 
 /*
  * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
- * for them being zero-based.  Another -1 is because ASID 0 is reserved for
+ * for them being zero-based.  Another -1 is because PCID 0 is reserved for
  * use by non-PCID-aware users.
  */
 #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
@@ -52,6 +69,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
  */
 #define TLB_NR_DYN_ASIDS	6
 
+/*
+ * Given @asid, compute kPCID
+ */
 static inline u16 kern_pcid(u16 asid)
 {
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
@@ -86,7 +106,7 @@ static inline u16 kern_pcid(u16 asid)
 }
 
 /*
- * The user PCID is just the kernel one, plus the "switch bit".
+ * Given @asid, compute uPCID
  */
 static inline u16 user_pcid(u16 asid)
 {
@@ -484,6 +504,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info);
 
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+	/*
+	 * Bump the generation count.  This also serves as a full barrier
+	 * that synchronizes with switch_mm(): callers are required to order
+	 * their read of mm_cpumask after their writes to the paging
+	 * structures.
+	 */
+	return atomic64_inc_return(&mm->context.tlb_gen);
+}
+
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 					struct mm_struct *mm)
 {

From 5f26d76c3fd67c48806415ef8b1116c97beff8ba Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 19 Dec 2017 22:33:46 +0100
Subject: [PATCH 136/305] x86/dumpstack: Indicate in Oops whether PTI is
 configured and enabled

CONFIG_PAGE_TABLE_ISOLATION is a relatively new and intrusive feature that may
still have some corner cases which could take some time to manifest and be
fixed. It would be useful to have Oops messages indicate whether it was
enabled for building the kernel, and whether it was disabled during boot.

Example of fully enabled:

	Oops: 0001 [#1] SMP PTI

Example of enabled during build, but disabled during boot:

	Oops: 0001 [#1] SMP NOPTI

We can decide to remove this after the feature has been tested in the field
long enough.

[ tglx: Made it use boot_cpu_has() as requested by Borislav ]

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Eduardo Valentin <eduval@amazon.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: bpetkov@suse.de
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: jkosina@suse.cz
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/dumpstack.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 36b17e0febe8..5fa110699ed2 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
 	unsigned long sp;
 #endif
 	printk(KERN_DEFAULT
-	       "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
+	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
 	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT"         : "",
 	       IS_ENABLED(CONFIG_SMP)     ? " SMP"             : "",
 	       debug_pagealloc_enabled()  ? " DEBUG_PAGEALLOC" : "",
-	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "");
+	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "",
+	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
 
 	if (notify_die(DIE_OOPS, str, regs, err,
 			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)

From 385ce0ea4c078517fa51c261882c4e72fba53005 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:08:03 +0100
Subject: [PATCH 137/305] x86/mm/pti: Add Kconfig

Finally allow CONFIG_PAGE_TABLE_ISOLATION to be enabled.

PARAVIRT generally requires that the kernel not manage its own page tables.
It also means that the hypervisor and kernel must agree wholeheartedly
about what format the page tables are in and what they contain.
PAGE_TABLE_ISOLATION, unfortunately, changes the rules and they
can not be used together.

I've seen conflicting feedback from maintainers lately about whether they
want the Kconfig magic to go first or last in a patch series.  It's going
last here because the partially-applied series leads to kernels that can
not boot in a bunch of cases.  I did a run through the entire series with
CONFIG_PAGE_TABLE_ISOLATION=y to look for build errors, though.

[ tglx: Removed SMP and !PARAVIRT dependencies as they no longer exist ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 security/Kconfig | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/security/Kconfig b/security/Kconfig
index e8e449444e65..a623d13bf288 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -54,6 +54,16 @@ config SECURITY_NETWORK
 	  implement socket and networking access controls.
 	  If you are unsure how to answer this question, answer N.
 
+config PAGE_TABLE_ISOLATION
+	bool "Remove the kernel mapping in user mode"
+	depends on X86_64 && !UML
+	help
+	  This feature reduces the number of hardware side channels by
+	  ensuring that the majority of kernel addresses are not mapped
+	  into userspace.
+
+	  See Documentation/x86/pagetable-isolation.txt for more details.
+
 config SECURITY_INFINIBAND
 	bool "Infiniband Security Hooks"
 	depends on SECURITY && INFINIBAND

From 75298aa179d56cd64f54e58a19fffc8ab922b4c0 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 4 Dec 2017 15:08:04 +0100
Subject: [PATCH 138/305] x86/mm/dump_pagetables: Add page table directory to
 the debugfs VFS hierarchy

The upcoming support for dumping the kernel and the user space page tables
of the current process would create more random files in the top level
debugfs directory.

Add a page table directory and move the existing file to it.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/debug_pagetables.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index bfcffdf6c577..d1449fb6dc7a 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -22,21 +22,26 @@ static const struct file_operations ptdump_fops = {
 	.release	= single_release,
 };
 
-static struct dentry *pe;
+static struct dentry *dir, *pe;
 
 static int __init pt_dump_debug_init(void)
 {
-	pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
-				 &ptdump_fops);
-	if (!pe)
+	dir = debugfs_create_dir("page_tables", NULL);
+	if (!dir)
 		return -ENOMEM;
 
+	pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops);
+	if (!pe)
+		goto err;
 	return 0;
+err:
+	debugfs_remove_recursive(dir);
+	return -ENOMEM;
 }
 
 static void __exit pt_dump_debug_exit(void)
 {
-	debugfs_remove_recursive(pe);
+	debugfs_remove_recursive(dir);
 }
 
 module_init(pt_dump_debug_init);

From b4bf4f924b1d7bade38fd51b2e401d20d0956e4d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:08:05 +0100
Subject: [PATCH 139/305] x86/mm/dump_pagetables: Check user space page table
 for WX pages

ptdump_walk_pgd_level_checkwx() checks the kernel page table for WX pages,
but does not check the PAGE_TABLE_ISOLATION user space page table.

Restructure the code so that dmesg output is selected by an explicit
argument and not implicit via checking the pgd argument for !NULL.

Add the check for the user space page table.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h |  1 +
 arch/x86/mm/debug_pagetables.c |  2 +-
 arch/x86/mm/dump_pagetables.c  | 30 +++++++++++++++++++++++++-----
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index cc6fa75884e9..03780d5c41c5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level_checkwx(void);
 
 #ifdef CONFIG_DEBUG_WX
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index d1449fb6dc7a..8e70c1599e51 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -5,7 +5,7 @@
 
 static int ptdump_show(struct seq_file *m, void *v)
 {
-	ptdump_walk_pgd_level(m, NULL);
+	ptdump_walk_pgd_level_debugfs(m, NULL);
 	return 0;
 }
 
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 690eaf31ca34..17f5b417f95e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -476,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
 }
 
 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
-				       bool checkwx)
+				       bool checkwx, bool dmesg)
 {
 #ifdef CONFIG_X86_64
 	pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -489,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 
 	if (pgd) {
 		start = pgd;
-		st.to_dmesg = true;
+		st.to_dmesg = dmesg;
 	}
 
 	st.check_wx = checkwx;
@@ -527,13 +527,33 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 {
-	ptdump_walk_pgd_level_core(m, pgd, false);
+	ptdump_walk_pgd_level_core(m, pgd, false, true);
+}
+
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd)
+{
+	ptdump_walk_pgd_level_core(m, pgd, false, false);
+}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
+
+static void ptdump_walk_user_pgd_level_checkwx(void)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pgd_t *pgd = (pgd_t *) &init_top_pgt;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("x86/mm: Checking user space page tables\n");
+	pgd = kernel_to_user_pgdp(pgd);
+	ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+#endif
 }
-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
 
 void ptdump_walk_pgd_level_checkwx(void)
 {
-	ptdump_walk_pgd_level_core(NULL, NULL, true);
+	ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+	ptdump_walk_user_pgd_level_checkwx();
 }
 
 static int __init pt_dump_init(void)

From a4b51ef6552c704764684cef7e753162dc87c5fa Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:08:06 +0100
Subject: [PATCH 140/305] x86/mm/dump_pagetables: Allow dumping current
 pagetables

Add two debugfs files which allow dumping the page table of the current
task.

current_kernel dumps the regular page table. This is the page table which
is normally shared between kernel and user space. If kernel page table
isolation is enabled this is the kernel space mapping.

If kernel page table isolation is enabled the second file, current_user,
dumps the user space page table.

These files allow verifying the resulting page tables for page table
isolation, but even in the normal case it's useful to be able to inspect
user space page tables of current for debugging purposes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h |  2 +-
 arch/x86/mm/debug_pagetables.c | 71 ++++++++++++++++++++++++++++++++--
 arch/x86/mm/dump_pagetables.c  |  6 ++-
 3 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 03780d5c41c5..6b43d677f8ca 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,7 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
 void ptdump_walk_pgd_level_checkwx(void);
 
 #ifdef CONFIG_DEBUG_WX
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 8e70c1599e51..421f2664ffa0 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -5,7 +5,7 @@
 
 static int ptdump_show(struct seq_file *m, void *v)
 {
-	ptdump_walk_pgd_level_debugfs(m, NULL);
+	ptdump_walk_pgd_level_debugfs(m, NULL, false);
 	return 0;
 }
 
@@ -22,7 +22,57 @@ static const struct file_operations ptdump_fops = {
 	.release	= single_release,
 };
 
-static struct dentry *dir, *pe;
+static int ptdump_show_curknl(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curknl(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curknl, NULL);
+}
+
+static const struct file_operations ptdump_curknl_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curknl,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static struct dentry *pe_curusr;
+
+static int ptdump_show_curusr(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curusr(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curusr, NULL);
+}
+
+static const struct file_operations ptdump_curusr_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curusr,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
+static struct dentry *dir, *pe_knl, *pe_curknl;
 
 static int __init pt_dump_debug_init(void)
 {
@@ -30,9 +80,22 @@ static int __init pt_dump_debug_init(void)
 	if (!dir)
 		return -ENOMEM;
 
-	pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops);
-	if (!pe)
+	pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
+				     &ptdump_fops);
+	if (!pe_knl)
 		goto err;
+
+	pe_curknl = debugfs_create_file("current_kernel", 0400,
+					dir, NULL, &ptdump_curknl_fops);
+	if (!pe_curknl)
+		goto err;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pe_curusr = debugfs_create_file("current_user", 0400,
+					dir, NULL, &ptdump_curusr_fops);
+	if (!pe_curusr)
+		goto err;
+#endif
 	return 0;
 err:
 	debugfs_remove_recursive(dir);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 17f5b417f95e..f56902c1f04b 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -530,8 +530,12 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 	ptdump_walk_pgd_level_core(m, pgd, false, true);
 }
 
-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd)
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
 {
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (user && static_cpu_has(X86_FEATURE_PTI))
+		pgd = kernel_to_user_pgdp(pgd);
+#endif
 	ptdump_walk_pgd_level_core(m, pgd, false, false);
 }
 EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);

From 9f5cb6b32d9e0a3a7453222baaf15664d92adbf2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 15 Dec 2017 20:35:11 +0100
Subject: [PATCH 141/305] x86/ldt: Make the LDT mapping RO

Now that the LDT mapping is in a known area when PAGE_TABLE_ISOLATION is
enabled, it's a primary target for attacks if a user space interface fails
to validate a write address correctly. That can never happen, right?

The SDM states:

    If the segment descriptors in the GDT or an LDT are placed in ROM, the
    processor can enter an indefinite loop if software or the processor
    attempts to update (write to) the ROM-based segment descriptors. To
    prevent this problem, set the accessed bits for all segment descriptors
    placed in a ROM. Also, remove operating-system or executive code that
    attempts to modify segment descriptors located in ROM.

So it's a valid approach to set the ACCESS bit when setting up the LDT entry
and to map the table RO. Fixup the selftest so it can handle that new mode.

Remove the manual ACCESS bit setter in set_tls_desc() as this is now
pointless. Folded the patch from Peter Zijlstra.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/desc.h           |  2 ++
 arch/x86/kernel/ldt.c                 |  7 ++++++-
 arch/x86/kernel/tls.c                 | 11 ++---------
 tools/testing/selftests/x86/ldt_gdt.c |  3 +--
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index bc359dd2f7f6..85e23bb7b34e 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 	desc->type		= (info->read_exec_only ^ 1) << 1;
 	desc->type	       |= info->contents << 2;
+	/* Set the ACCESS bit so it can be mapped RO */
+	desc->type	       |= 1;
 
 	desc->s			= 1;
 	desc->dpl		= 0x3;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 9629c5d8267a..579cc4a66fdf 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -158,7 +158,12 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
 		ptep = get_locked_pte(mm, va, &ptl);
 		if (!ptep)
 			return -ENOMEM;
-		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
+		/*
+		 * Map it RO so the easy to find address is not a primary
+		 * target via some kernel interface which misses a
+		 * permission check.
+		 */
+		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
 		set_pte_at(mm, va, ptep, pte);
 		pte_unmap_unlock(ptep, ptl);
 	}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 9a9c9b076955..a5b802a12212 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
 	cpu = get_cpu();
 
 	while (n-- > 0) {
-		if (LDT_empty(info) || LDT_zero(info)) {
+		if (LDT_empty(info) || LDT_zero(info))
 			memset(desc, 0, sizeof(*desc));
-		} else {
+		else
 			fill_ldt(desc, info);
-
-			/*
-			 * Always set the accessed bit so that the CPU
-			 * doesn't try to write to the (read-only) GDT.
-			 */
-			desc->type |= 1;
-		}
 		++info;
 		++desc;
 	}
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index 0304ffb714f2..1aef72df20a1 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt,
 	 * NB: Different Linux versions do different things with the
 	 * accessed bit in set_thread_area().
 	 */
-	if (ar != expected_ar &&
-	    (ldt || ar != (expected_ar | AR_ACCESSED))) {
+	if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
 		printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
 		       (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
 		nerrs++;

From c0ee554906c3d6554fbddf95ae664cd9f817082b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 22 Dec 2017 12:37:43 -0600
Subject: [PATCH 142/305] pid: Handle failure to allocate the first pid in a
 pid namespace

With the replacement of the pid bitmap and hashtable with an idr,
alloc_pid started occasionally failing when allocating the first pid
in a pid namespace.  Things were not completely reset resulting in
the first allocated pid getting the number 2 (not 1).  Which
further resulted in ns->proc_mnt not getting set and eventually
causing an oops in proc_flush_task.

Oops: 0000 [#1] SMP
CPU: 2 PID: 6743 Comm: trinity-c117 Not tainted 4.15.0-rc4-think+ #2
RIP: 0010:proc_flush_task+0x8e/0x1b0
RSP: 0018:ffffc9000bbffc40 EFLAGS: 00010286
RAX: 0000000000000001 RBX: 0000000000000001 RCX: 00000000fffffffb
RDX: 0000000000000000 RSI: ffffc9000bbffc50 RDI: 0000000000000000
RBP: ffffc9000bbffc63 R08: 0000000000000000 R09: 0000000000000002
R10: ffffc9000bbffb70 R11: ffffc9000bbffc64 R12: 0000000000000003
R13: 0000000000000000 R14: 0000000000000003 R15: ffff8804c10d7840
FS:  00007f7cb8965700(0000) GS:ffff88050a200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 00000003e21ae003 CR4: 00000000001606e0
DR0: 00007fb1d6c22000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
Call Trace:
 ? release_task+0xaf/0x680
 release_task+0xd2/0x680
 ? wait_consider_task+0xb82/0xce0
 wait_consider_task+0xbe9/0xce0
 ? do_wait+0xe1/0x330
 do_wait+0x151/0x330
 kernel_wait4+0x8d/0x150
 ? task_stopped_code+0x50/0x50
 SYSC_wait4+0x95/0xa0
 ? rcu_read_lock_sched_held+0x6c/0x80
 ? syscall_trace_enter+0x2d7/0x340
 ? do_syscall_64+0x60/0x210
 do_syscall_64+0x60/0x210
 entry_SYSCALL64_slow_path+0x25/0x25
RIP: 0033:0x7f7cb82603aa
RSP: 002b:00007ffd60770bc8 EFLAGS: 00000246
 ORIG_RAX: 000000000000003d
RAX: ffffffffffffffda RBX: 00007f7cb6cd4000 RCX: 00007f7cb82603aa
RDX: 000000000000000b RSI: 00007ffd60770bd0 RDI: 0000000000007cca
RBP: 0000000000007cca R08: 00007f7cb8965700 R09: 00007ffd607c7080
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ffd60770bd0 R14: 00007f7cb6cd4058 R15: 00000000cccccccd
Code: c1 e2 04 44 8b 60 30 48 8b 40 38 44 8b 34 11 48 c7 c2 60 3a f5 81 44 89 e1 4c 8b 68 58 e8 4b b4 77 00 89 44 24 14 48 8d 74 24 10 <49> 8b 7d 00 e8 b9 6a f9 ff 48 85 c0 74 1a 48 89 c7 48 89 44 24
RIP: proc_flush_task+0x8e/0x1b0 RSP: ffffc9000bbffc40
CR2: 0000000000000000
---[ end trace 53d67a6481059862 ]---

Improve the quality of the implementation by resetting the place to
start allocating pids on failure to allocate the first pid.

As improving the quality of the implementation is the goal, remove the now
unnecessary disable_pid_allocation call when we fail to mount proc.

Fixes: 95846ecf9dac ("pid: replace pid bitmap implementation with IDR API")
Fixes: 8ef047aaaeb8 ("pid namespaces: make alloc_pid(), free_pid() and put_pid() work with struct upid")
Reported-by: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/pid.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index b13b624e2c49..1e8bb6550ec4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -193,10 +193,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	}
 
 	if (unlikely(is_child_reaper(pid))) {
-		if (pid_ns_prepare_proc(ns)) {
-			disable_pid_allocation(ns);
+		if (pid_ns_prepare_proc(ns))
 			goto out_free;
-		}
 	}
 
 	get_pid_ns(ns);
@@ -226,6 +224,10 @@ out_free:
 	while (++i <= ns->level)
 		idr_remove(&ns->idr, (pid->numbers + i)->nr);
 
+	/* On failure to allocate the first pid, reset the state */
+	if (ns->pid_allocated == PIDNS_ADDING)
+		idr_set_cursor(&ns->idr, 0);
+
 	spin_unlock_irq(&pidmap_lock);
 
 	kmem_cache_free(ns->pid_cachep, pid);

From 8bea728dce8972e534e6b99fd550f7b5cc3864e8 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 25 Dec 2017 11:34:54 +0800
Subject: [PATCH 143/305] netfilter: nf_tables: fix potential NULL-ptr deref in
 nf_tables_dump_obj_done()

If there is no NFTA_OBJ_TABLE and NFTA_OBJ_TYPE, the c.data will be NULL in
nf_tables_getobj(). So before freeing filter->table in nf_tables_dump_obj_done(),
we need to check if filter is NULL first.

Fixes: e46abbcc05aa ("netfilter: nf_tables: Allow table names of up to 255 chars")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8d4526651661..07bd4138c84e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4665,8 +4665,10 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb)
 {
 	struct nft_obj_filter *filter = cb->data;
 
-	kfree(filter->table);
-	kfree(filter);
+	if (filter) {
+		kfree(filter->table);
+		kfree(filter);
+	}
 
 	return 0;
 }

From c6a36ad383559a60a249aa6016cebf3cb8b6c485 Mon Sep 17 00:00:00 2001
From: Max Schulze <max.schulze@posteo.de>
Date: Wed, 20 Dec 2017 20:47:44 +0100
Subject: [PATCH 144/305] USB: serial: ftdi_sio: add id for Airbus DS P8GR

Add AIRBUS_DS_P8GR device IDs to ftdi_sio driver.

Signed-off-by: Max Schulze <max.schulze@posteo.de>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/ftdi_sio.c     | 1 +
 drivers/usb/serial/ftdi_sio_ids.h | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 1aba9105b369..fc68952c994a 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -1013,6 +1013,7 @@ static const struct usb_device_id id_table_combined[] = {
 		.driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
 	{ USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_BT_USB_PID) },
 	{ USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_WL_USB_PID) },
+	{ USB_DEVICE(AIRBUS_DS_VID, AIRBUS_DS_P8GR) },
 	{ }					/* Terminating entry */
 };
 
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index 4faa09fe308c..8b4ecd2bd297 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -914,6 +914,12 @@
 #define ICPDAS_I7561U_PID		0x0104
 #define ICPDAS_I7563U_PID		0x0105
 
+/*
+ * Airbus Defence and Space
+ */
+#define AIRBUS_DS_VID			0x1e8e  /* Vendor ID */
+#define AIRBUS_DS_P8GR			0x6001  /* Tetra P8GR */
+
 /*
  * RT Systems programming cables for various ham radios
  */

From 052f71e25a7ecd80a9567b291df8ea333d9a8565 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Thu, 21 Dec 2017 15:06:13 +0200
Subject: [PATCH 145/305] xhci: Fix xhci debugfs NULL pointer dereference in
 resume from hibernate

Free the virt_device and its debugfs_private member together.

When resuming from hibernate the .free_dev callback unconditionally
freed the debugfs_private member, but could leave virt_device intact.

This triggered a NULL pointer dereference after resume when usbmuxd
sent a USBDEVFS_SETCONFIGURATION ioctl to a device, trying to add an
endpoint debugfs entry to an already freed debugfs_private pointer.

Fixes: 02b6fdc2a153 ("usb: xhci: Add debugfs interface for xHCI driver")
Reported-by: Alexander Kappner <agk@godking.net>
Tested-by: Alexander Kappner <agk@godking.net>
Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/xhci.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 2424d3020ca3..da6dbe3ebd8b 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -3525,8 +3525,6 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
 	struct xhci_slot_ctx *slot_ctx;
 	int i, ret;
 
-	xhci_debugfs_remove_slot(xhci, udev->slot_id);
-
 #ifndef CONFIG_USB_DEFAULT_PERSIST
 	/*
 	 * We called pm_runtime_get_noresume when the device was attached.
@@ -3555,8 +3553,10 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
 	}
 
 	ret = xhci_disable_slot(xhci, udev->slot_id);
-	if (ret)
+	if (ret) {
+		xhci_debugfs_remove_slot(xhci, udev->slot_id);
 		xhci_free_virt_device(xhci, udev->slot_id);
+	}
 }
 
 int xhci_disable_slot(struct xhci_hcd *xhci, u32 slot_id)

From dde634057da71a3505d7a6c0b77bb24ded6728c8 Mon Sep 17 00:00:00 2001
From: Alexander Kappner <agk@godking.net>
Date: Thu, 21 Dec 2017 15:06:14 +0200
Subject: [PATCH 146/305] xhci: Fix use-after-free in xhci debugfs

Trying to read from debugfs after the system has resumed from
hibernate causes a use-after-free and thus a protection fault.

Steps to reproduce:
Hibernate system, resume from hibernate, then run
$ cat /sys/kernel/debug/usb/xhci/*/command-ring/enqueue

[ 3902.765086] general protection fault: 0000 [#1] PREEMPT SMP
...
[ 3902.765136] RIP: 0010:xhci_trb_virt_to_dma.part.50+0x5/0x30
...
[ 3902.765178] Call Trace:
[ 3902.765188]  xhci_ring_enqueue_show+0x1e/0x40
[ 3902.765197]  seq_read+0xdb/0x3a0
[ 3902.765204]  ? __handle_mm_fault+0x5fb/0x1210
[ 3902.765211]  full_proxy_read+0x4a/0x70
[ 3902.765219]  __vfs_read+0x23/0x120
[ 3902.765228]  vfs_read+0x8e/0x130
[ 3902.765235]  SyS_read+0x42/0x90
[ 3902.765242]  do_syscall_64+0x6b/0x290
[ 3902.765251]  entry_SYSCALL64_slow_path+0x25/0x25

The issue is caused by the xhci ring structures being reallocated
when the system is resumed, but pointers to the old structures
being retained in the debugfs files "private" field:

The proposed patch fixes this issue by storing a pointer to the xhci_ring
field in the xhci device structure in debugfs rather than directly
storing a pointer to the xhci_ring.

Fixes: 02b6fdc2a153 ("usb: xhci: Add debugfs interface for xHCI driver")
Signed-off-by: Alexander Kappner <agk@godking.net>
Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/xhci-debugfs.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/usb/host/xhci-debugfs.c b/drivers/usb/host/xhci-debugfs.c
index 4f7895dbcf88..e26e685d8a57 100644
--- a/drivers/usb/host/xhci-debugfs.c
+++ b/drivers/usb/host/xhci-debugfs.c
@@ -162,7 +162,7 @@ static void xhci_debugfs_extcap_regset(struct xhci_hcd *xhci, int cap_id,
 static int xhci_ring_enqueue_show(struct seq_file *s, void *unused)
 {
 	dma_addr_t		dma;
-	struct xhci_ring	*ring = s->private;
+	struct xhci_ring	*ring = *(struct xhci_ring **)s->private;
 
 	dma = xhci_trb_virt_to_dma(ring->enq_seg, ring->enqueue);
 	seq_printf(s, "%pad\n", &dma);
@@ -173,7 +173,7 @@ static int xhci_ring_enqueue_show(struct seq_file *s, void *unused)
 static int xhci_ring_dequeue_show(struct seq_file *s, void *unused)
 {
 	dma_addr_t		dma;
-	struct xhci_ring	*ring = s->private;
+	struct xhci_ring	*ring = *(struct xhci_ring **)s->private;
 
 	dma = xhci_trb_virt_to_dma(ring->deq_seg, ring->dequeue);
 	seq_printf(s, "%pad\n", &dma);
@@ -183,7 +183,7 @@ static int xhci_ring_dequeue_show(struct seq_file *s, void *unused)
 
 static int xhci_ring_cycle_show(struct seq_file *s, void *unused)
 {
-	struct xhci_ring	*ring = s->private;
+	struct xhci_ring	*ring = *(struct xhci_ring **)s->private;
 
 	seq_printf(s, "%d\n", ring->cycle_state);
 
@@ -346,7 +346,7 @@ static void xhci_debugfs_create_files(struct xhci_hcd *xhci,
 }
 
 static struct dentry *xhci_debugfs_create_ring_dir(struct xhci_hcd *xhci,
-						   struct xhci_ring *ring,
+						   struct xhci_ring **ring,
 						   const char *name,
 						   struct dentry *parent)
 {
@@ -387,7 +387,7 @@ void xhci_debugfs_create_endpoint(struct xhci_hcd *xhci,
 
 	snprintf(epriv->name, sizeof(epriv->name), "ep%02d", ep_index);
 	epriv->root = xhci_debugfs_create_ring_dir(xhci,
-						   dev->eps[ep_index].new_ring,
+						   &dev->eps[ep_index].new_ring,
 						   epriv->name,
 						   spriv->root);
 	spriv->eps[ep_index] = epriv;
@@ -423,7 +423,7 @@ void xhci_debugfs_create_slot(struct xhci_hcd *xhci, int slot_id)
 	priv->dev = dev;
 	dev->debugfs_private = priv;
 
-	xhci_debugfs_create_ring_dir(xhci, dev->eps[0].ring,
+	xhci_debugfs_create_ring_dir(xhci, &dev->eps[0].ring,
 				     "ep00", priv->root);
 
 	xhci_debugfs_create_context_files(xhci, priv->root, slot_id);
@@ -488,11 +488,11 @@ void xhci_debugfs_init(struct xhci_hcd *xhci)
 				   ARRAY_SIZE(xhci_extcap_dbc),
 				   "reg-ext-dbc");
 
-	xhci_debugfs_create_ring_dir(xhci, xhci->cmd_ring,
+	xhci_debugfs_create_ring_dir(xhci, &xhci->cmd_ring,
 				     "command-ring",
 				     xhci->debugfs_root);
 
-	xhci_debugfs_create_ring_dir(xhci, xhci->event_ring,
+	xhci_debugfs_create_ring_dir(xhci, &xhci->event_ring,
 				     "event-ring",
 				     xhci->debugfs_root);
 

From da99706689481717998d1d48edd389f339eea979 Mon Sep 17 00:00:00 2001
From: Daniel Thompson <daniel.thompson@linaro.org>
Date: Thu, 21 Dec 2017 15:06:15 +0200
Subject: [PATCH 147/305] usb: xhci: Add XHCI_TRUST_TX_LENGTH for Renesas
 uPD720201

When plugging in a USB webcam I see the following message:
xhci_hcd 0000:04:00.0: WARN Successful completion on short TX: needs
XHCI_TRUST_TX_LENGTH quirk?
handle_tx_event: 913 callbacks suppressed

All is quiet again with this patch (and I've done a fair bit of soak
testing with the camera since).

Cc: <stable@vger.kernel.org>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/xhci-pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 7ef1274ef7f7..1aad89b8aba0 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -177,6 +177,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
 		xhci->quirks |= XHCI_TRUST_TX_LENGTH;
 		xhci->quirks |= XHCI_BROKEN_STREAMS;
 	}
+	if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
+			pdev->device == 0x0014)
+		xhci->quirks |= XHCI_TRUST_TX_LENGTH;
 	if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
 			pdev->device == 0x0015)
 		xhci->quirks |= XHCI_RESET_ON_RESUME;

From 76dc6c097d581ad8eeedf8e1a000423a3d742445 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Tue, 26 Dec 2017 15:08:53 +0100
Subject: [PATCH 148/305] cpu/hotplug: Move inline keyword at the beginning of
 declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix non-fatal warnings such as:

kernel/cpu.c:95:1: warning: ‘inline’ is not at beginning of declaration [-Wold-style-declaration]
 static void inline cpuhp_lock_release(bool bringup) { }
 ^~~~~~

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Link: https://lkml.kernel.org/r/20171226140855.16583-1-malat@debian.org
---
 kernel/cpu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 41376c3ac93b..3d002a6f216e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map =
 	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
 
 
-static void inline cpuhp_lock_acquire(bool bringup)
+static inline void cpuhp_lock_acquire(bool bringup)
 {
 	lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
 }
 
-static void inline cpuhp_lock_release(bool bringup)
+static inline void cpuhp_lock_release(bool bringup)
 {
 	lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
 }
 #else
 
-static void inline cpuhp_lock_acquire(bool bringup) { }
-static void inline cpuhp_lock_release(bool bringup) { }
+static inline void cpuhp_lock_acquire(bool bringup) { }
+static inline void cpuhp_lock_release(bool bringup) { }
 
 #endif
 

From 7ad1437d6ace0e450a6c1167720608ad660b191d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 27 Dec 2017 19:45:31 +0100
Subject: [PATCH 149/305] perf/x86/intel: Plug memory leak in intel_pmu_init()

A recent commit introduced an extra merge_attr() call in the skylake
branch, which causes a memory leak.

Store the pointer to the extra allocated memory and free it at the end of
the function.

Fixes: a5df70c354c2 ("perf/x86: Only show format attributes when supported")
Reported-by: Tommi Rantala <tommi.t.rantala@nokia.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/events/intel/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 09c26a4f139c..731153a4681e 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3847,6 +3847,8 @@ static struct attribute *intel_pmu_attrs[] = {
 
 __init int intel_pmu_init(void)
 {
+	struct attribute **extra_attr = NULL;
+	struct attribute **to_free = NULL;
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
 	union cpuid10_ebx ebx;
@@ -3854,7 +3856,6 @@ __init int intel_pmu_init(void)
 	unsigned int unused;
 	struct extra_reg *er;
 	int version, i;
-	struct attribute **extra_attr = NULL;
 	char *name;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -4294,6 +4295,7 @@ __init int intel_pmu_init(void)
 		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
 			hsw_format_attr : nhm_format_attr;
 		extra_attr = merge_attr(extra_attr, skl_format_attr);
+		to_free = extra_attr;
 		x86_pmu.cpu_events = get_hsw_events_attrs();
 		intel_pmu_pebs_data_source_skl(
 			boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
@@ -4401,6 +4403,7 @@ __init int intel_pmu_init(void)
 		pr_cont("full-width counters, ");
 	}
 
+	kfree(to_free);
 	return 0;
 }
 

From 7ac139eaa6bbdb07c547b6916a808eab3897e0e3 Mon Sep 17 00:00:00 2001
From: rodrigosiqueira <rodrigosiqueiramelo@gmail.com>
Date: Fri, 15 Dec 2017 11:15:33 -0200
Subject: [PATCH 150/305] x86: Remove unused parameter of prepare_switch_to

Commit e37e43a497d5 ("x86/mm/64: Enable vmapped stacks
(CONFIG_HAVE_ARCH_VMAP_STACK=y)") added prepare_switch_to with one extra
parameter which is not used by the function, remove it.

Signed-off-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-janitors@vger.kernel.org
Link: https://lkml.kernel.org/r/20171215131533.hp6kqebw45o7uvsb@smtp.gmail.com
---
 arch/x86/include/asm/switch_to.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8c6bd6863db9..1008d4622709 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -16,8 +16,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss);
 
 /* This runs runs on the previous thread's stack. */
-static inline void prepare_switch_to(struct task_struct *prev,
-				     struct task_struct *next)
+static inline void prepare_switch_to(struct task_struct *next)
 {
 #ifdef CONFIG_VMAP_STACK
 	/*
@@ -70,7 +69,7 @@ struct fork_frame {
 
 #define switch_to(prev, next, last)					\
 do {									\
-	prepare_switch_to(prev, next);					\
+	prepare_switch_to(next);					\
 									\
 	((last) = __switch_to_asm((prev), (next)));			\
 } while (0)

From ac461122c88a10b7d775de2f56467f097c9e627a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 27 Dec 2017 11:48:50 -0800
Subject: [PATCH 151/305] x86-32: Fix kexec with stack canary
 (CONFIG_CC_STACKPROTECTOR)

Commit e802a51ede91 ("x86/idt: Consolidate IDT invalidation") cleaned up
and unified the IDT invalidation that existed in a couple of places.  It
changed no actual real code.

Despite not changing any actual real code, it _did_ change code generation:
by implementing the common idt_invalidate() function in
arch/x86/kernel/idt.c, it made the use of the function in
arch/x86/kernel/machine_kexec_32.c be a real function call rather than an
(accidental) inlining of the function.

That, in turn, exposed two issues:

 - in load_segments(), we had incorrectly reset all the segment
   registers, which then made the stack canary load (which gcc does
   using offset of %gs) cause a trap.  Instead of %gs pointing to the
   stack canary, it will be the normal zero-based kernel segment, and
   the stack canary load will take a page fault at address 0x14.

 - to make this even harder to debug, we had invalidated the GDT just
   before calling idt_invalidate(), which meant that the fault happened
   with an invalid GDT, which in turn causes a triple fault and
   immediate reboot.

Fix this by

 (a) not reloading the special segments in load_segments(). We currently
     don't do any percpu accesses (which would require %fs on x86-32) in
     this area, but there's no reason to think that we might not want to
     do them, and like %gs, it's pointless to break it.

 (b) doing idt_invalidate() before invalidating the GDT, to keep things
     at least _slightly_ more debuggable for a bit longer. Without an
     IDT, traps will not work. Without a GDT, traps also will not work,
     but neither will any segment loads etc. So in a very real sense,
     the GDT is even more core than the IDT.

Fixes: e802a51ede91 ("x86/idt: Consolidate IDT invalidation")
Reported-and-tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.LFD.2.21.1712271143180.8572@i7.lan
---
 arch/x86/kernel/machine_kexec_32.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 00bc751c861c..edfede768688 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -48,8 +48,6 @@ static void load_segments(void)
 		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
 		"\tmovl %%eax,%%ds\n"
 		"\tmovl %%eax,%%es\n"
-		"\tmovl %%eax,%%fs\n"
-		"\tmovl %%eax,%%gs\n"
 		"\tmovl %%eax,%%ss\n"
 		: : : "eax", "memory");
 #undef STR
@@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image)
 	 * The gdt & idt are now invalid.
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
-	set_gdt(phys_to_virt(0), 0);
 	idt_invalidate(phys_to_virt(0));
+	set_gdt(phys_to_virt(0), 0);
 
 	/* now call it */
 	image->start = relocate_kernel_ptr((unsigned long)image->head,

From 59585b4be9ae4dc6506551709bdcd6f5210b8a01 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@inai.de>
Date: Mon, 25 Dec 2017 03:43:53 +0100
Subject: [PATCH 152/305] sparc64: repair calling incorrect hweight function
 from stubs

Commit v4.12-rc4-1-g9289ea7f952b introduced a mistake that made the
64-bit hweight stub call the 16-bit hweight function.

Fixes: 9289ea7f952b ("sparc64: Use indirect calls in hamming weight stubs")
Signed-off-by: Jan Engelhardt <jengelh@inai.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/lib/hweight.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S
index e5547b22cd18..0ddbbb031822 100644
--- a/arch/sparc/lib/hweight.S
+++ b/arch/sparc/lib/hweight.S
@@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)
 	.previous
 
 ENTRY(__arch_hweight64)
-	sethi	%hi(__sw_hweight16), %g1
-	jmpl	%g1 + %lo(__sw_hweight16), %g0
+	sethi	%hi(__sw_hweight64), %g1
+	jmpl	%g1 + %lo(__sw_hweight64), %g0
 	 nop
 ENDPROC(__arch_hweight64)
 EXPORT_SYMBOL(__arch_hweight64)

From 39c3fd58952d7599d367c84c1330b785d91d6088 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 2 Dec 2017 18:11:04 +0100
Subject: [PATCH 153/305] kernel/irq: Extend lockdep class for request mutex

The IRQ code already has support for lockdep class for the lock mutex
in an interrupt descriptor. Extend this to add a second class for the
request mutex in the descriptor. Not having a class is resulting in
false positive splats in some code paths.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: linus.walleij@linaro.org
Cc: grygorii.strashko@ti.com
Cc: f.fainelli@gmail.com
Link: https://lkml.kernel.org/r/1512234664-21555-1-git-send-email-andrew@lunn.ch
---
 arch/powerpc/sysdev/fsl_msi.c             |  4 ++-
 drivers/gpio/gpio-bcm-kona.c              |  3 ++-
 drivers/gpio/gpio-brcmstb.c               |  4 ++-
 drivers/gpio/gpio-tegra.c                 |  4 ++-
 drivers/gpio/gpiolib.c                    | 27 ++++++++++++-------
 drivers/irqchip/irq-renesas-intc-irqpin.c |  6 ++++-
 drivers/mfd/arizona-irq.c                 |  4 ++-
 drivers/pinctrl/pinctrl-single.c          |  5 +++-
 include/linux/gpio/driver.h               | 33 ++++++++++++++---------
 include/linux/irqdesc.h                   |  9 ++++---
 kernel/irq/generic-chip.c                 | 11 +++++---
 11 files changed, 75 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 44cbf4c12ea1..df95102e732c 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -354,6 +354,7 @@ static int fsl_of_msi_remove(struct platform_device *ofdev)
 }
 
 static struct lock_class_key fsl_msi_irq_class;
+static struct lock_class_key fsl_msi_irq_request_class;
 
 static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
 			       int offset, int irq_index)
@@ -373,7 +374,8 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
 		dev_err(&dev->dev, "No memory for MSI cascade data\n");
 		return -ENOMEM;
 	}
-	irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class);
+	irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class,
+			      &fsl_msi_irq_request_class);
 	cascade_data->index = offset;
 	cascade_data->msi_data = msi;
 	cascade_data->virq = virt_msir;
diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c
index dfcf56ee3c61..76861a00bb92 100644
--- a/drivers/gpio/gpio-bcm-kona.c
+++ b/drivers/gpio/gpio-bcm-kona.c
@@ -522,6 +522,7 @@ static struct of_device_id const bcm_kona_gpio_of_match[] = {
  * category than their parents, so it won't report false recursion.
  */
 static struct lock_class_key gpio_lock_class;
+static struct lock_class_key gpio_request_class;
 
 static int bcm_kona_gpio_irq_map(struct irq_domain *d, unsigned int irq,
 				 irq_hw_number_t hwirq)
@@ -531,7 +532,7 @@ static int bcm_kona_gpio_irq_map(struct irq_domain *d, unsigned int irq,
 	ret = irq_set_chip_data(irq, d->host_data);
 	if (ret < 0)
 		return ret;
-	irq_set_lockdep_class(irq, &gpio_lock_class);
+	irq_set_lockdep_class(irq, &gpio_lock_class, &gpio_request_class);
 	irq_set_chip_and_handler(irq, &bcm_gpio_irq_chip, handle_simple_irq);
 	irq_set_noprobe(irq);
 
diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c
index 545d43a587b7..5b24801bffef 100644
--- a/drivers/gpio/gpio-brcmstb.c
+++ b/drivers/gpio/gpio-brcmstb.c
@@ -327,6 +327,7 @@ static struct brcmstb_gpio_bank *brcmstb_gpio_hwirq_to_bank(
  * category than their parents, so it won't report false recursion.
  */
 static struct lock_class_key brcmstb_gpio_irq_lock_class;
+static struct lock_class_key brcmstb_gpio_irq_request_class;
 
 
 static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq,
@@ -346,7 +347,8 @@ static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq,
 	ret = irq_set_chip_data(irq, &bank->gc);
 	if (ret < 0)
 		return ret;
-	irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class);
+	irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class,
+			      &brcmstb_gpio_irq_request_class);
 	irq_set_chip_and_handler(irq, &priv->irq_chip, handle_level_irq);
 	irq_set_noprobe(irq);
 	return 0;
diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c
index 8db47f671708..02fa8fe2292a 100644
--- a/drivers/gpio/gpio-tegra.c
+++ b/drivers/gpio/gpio-tegra.c
@@ -565,6 +565,7 @@ static const struct dev_pm_ops tegra_gpio_pm_ops = {
  * than their parents, so it won't report false recursion.
  */
 static struct lock_class_key gpio_lock_class;
+static struct lock_class_key gpio_request_class;
 
 static int tegra_gpio_probe(struct platform_device *pdev)
 {
@@ -670,7 +671,8 @@ static int tegra_gpio_probe(struct platform_device *pdev)
 
 		bank = &tgi->bank_info[GPIO_BANK(gpio)];
 
-		irq_set_lockdep_class(irq, &gpio_lock_class);
+		irq_set_lockdep_class(irq, &gpio_lock_class,
+				      &gpio_request_class);
 		irq_set_chip_data(irq, bank);
 		irq_set_chip_and_handler(irq, &tgi->ic, handle_simple_irq);
 	}
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index aad84a6306c4..44332b793718 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -73,7 +73,8 @@ LIST_HEAD(gpio_devices);
 
 static void gpiochip_free_hogs(struct gpio_chip *chip);
 static int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
-				struct lock_class_key *key);
+				struct lock_class_key *lock_key,
+				struct lock_class_key *request_key);
 static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip);
 static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gpiochip);
 static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gpiochip);
@@ -1100,7 +1101,8 @@ static void gpiochip_setup_devs(void)
 }
 
 int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
-			       struct lock_class_key *key)
+			       struct lock_class_key *lock_key,
+			       struct lock_class_key *request_key)
 {
 	unsigned long	flags;
 	int		status = 0;
@@ -1246,7 +1248,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
 	if (status)
 		goto err_remove_from_list;
 
-	status = gpiochip_add_irqchip(chip, key);
+	status = gpiochip_add_irqchip(chip, lock_key, request_key);
 	if (status)
 		goto err_remove_chip;
 
@@ -1632,7 +1634,7 @@ int gpiochip_irq_map(struct irq_domain *d, unsigned int irq,
 	 * This lock class tells lockdep that GPIO irqs are in a different
 	 * category than their parents, so it won't report false recursion.
 	 */
-	irq_set_lockdep_class(irq, chip->irq.lock_key);
+	irq_set_lockdep_class(irq, chip->irq.lock_key, chip->irq.request_key);
 	irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq.handler);
 	/* Chips that use nested thread handlers have them marked */
 	if (chip->irq.threaded)
@@ -1712,10 +1714,12 @@ static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset)
 /**
  * gpiochip_add_irqchip() - adds an IRQ chip to a GPIO chip
  * @gpiochip: the GPIO chip to add the IRQ chip to
- * @lock_key: lockdep class
+ * @lock_key: lockdep class for IRQ lock
+ * @request_key: lockdep class for IRQ request
  */
 static int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
-				struct lock_class_key *lock_key)
+				struct lock_class_key *lock_key,
+				struct lock_class_key *request_key)
 {
 	struct irq_chip *irqchip = gpiochip->irq.chip;
 	const struct irq_domain_ops *ops;
@@ -1753,6 +1757,7 @@ static int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
 	gpiochip->to_irq = gpiochip_to_irq;
 	gpiochip->irq.default_type = type;
 	gpiochip->irq.lock_key = lock_key;
+	gpiochip->irq.request_key = request_key;
 
 	if (gpiochip->irq.domain_ops)
 		ops = gpiochip->irq.domain_ops;
@@ -1850,7 +1855,8 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip)
  * @type: the default type for IRQs on this irqchip, pass IRQ_TYPE_NONE
  * to have the core avoid setting up any default type in the hardware.
  * @threaded: whether this irqchip uses a nested thread handler
- * @lock_key: lockdep class
+ * @lock_key: lockdep class for IRQ lock
+ * @request_key: lockdep class for IRQ request
  *
  * This function closely associates a certain irqchip with a certain
  * gpiochip, providing an irq domain to translate the local IRQs to
@@ -1872,7 +1878,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip,
 			     irq_flow_handler_t handler,
 			     unsigned int type,
 			     bool threaded,
-			     struct lock_class_key *lock_key)
+			     struct lock_class_key *lock_key,
+			     struct lock_class_key *request_key)
 {
 	struct device_node *of_node;
 
@@ -1913,6 +1920,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip,
 	gpiochip->irq.default_type = type;
 	gpiochip->to_irq = gpiochip_to_irq;
 	gpiochip->irq.lock_key = lock_key;
+	gpiochip->irq.request_key = request_key;
 	gpiochip->irq.domain = irq_domain_add_simple(of_node,
 					gpiochip->ngpio, first_irq,
 					&gpiochip_domain_ops, gpiochip);
@@ -1940,7 +1948,8 @@ EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_key);
 #else /* CONFIG_GPIOLIB_IRQCHIP */
 
 static inline int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
-				       struct lock_class_key *key)
+				       struct lock_class_key *lock_key,
+				       struct lock_class_key *request_key)
 {
 	return 0;
 }
diff --git a/drivers/irqchip/irq-renesas-intc-irqpin.c b/drivers/irqchip/irq-renesas-intc-irqpin.c
index 06f29cf5018a..cee59fe1321c 100644
--- a/drivers/irqchip/irq-renesas-intc-irqpin.c
+++ b/drivers/irqchip/irq-renesas-intc-irqpin.c
@@ -342,6 +342,9 @@ static irqreturn_t intc_irqpin_shared_irq_handler(int irq, void *dev_id)
  */
 static struct lock_class_key intc_irqpin_irq_lock_class;
 
+/* And this is for the request mutex */
+static struct lock_class_key intc_irqpin_irq_request_class;
+
 static int intc_irqpin_irq_domain_map(struct irq_domain *h, unsigned int virq,
 				      irq_hw_number_t hw)
 {
@@ -352,7 +355,8 @@ static int intc_irqpin_irq_domain_map(struct irq_domain *h, unsigned int virq,
 
 	intc_irqpin_dbg(&p->irq[hw], "map");
 	irq_set_chip_data(virq, h->host_data);
-	irq_set_lockdep_class(virq, &intc_irqpin_irq_lock_class);
+	irq_set_lockdep_class(virq, &intc_irqpin_irq_lock_class,
+			      &intc_irqpin_irq_request_class);
 	irq_set_chip_and_handler(virq, &p->irq_chip, handle_level_irq);
 	return 0;
 }
diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c
index 09cf3699e354..a307832d7e45 100644
--- a/drivers/mfd/arizona-irq.c
+++ b/drivers/mfd/arizona-irq.c
@@ -184,6 +184,7 @@ static struct irq_chip arizona_irq_chip = {
 };
 
 static struct lock_class_key arizona_irq_lock_class;
+static struct lock_class_key arizona_irq_request_class;
 
 static int arizona_irq_map(struct irq_domain *h, unsigned int virq,
 			      irq_hw_number_t hw)
@@ -191,7 +192,8 @@ static int arizona_irq_map(struct irq_domain *h, unsigned int virq,
 	struct arizona *data = h->host_data;
 
 	irq_set_chip_data(virq, data);
-	irq_set_lockdep_class(virq, &arizona_irq_lock_class);
+	irq_set_lockdep_class(virq, &arizona_irq_lock_class,
+		&arizona_irq_request_class);
 	irq_set_chip_and_handler(virq, &arizona_irq_chip, handle_simple_irq);
 	irq_set_nested_thread(virq, 1);
 	irq_set_noprobe(virq);
diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c
index e6cd8de793e2..3501491e5bfc 100644
--- a/drivers/pinctrl/pinctrl-single.c
+++ b/drivers/pinctrl/pinctrl-single.c
@@ -222,6 +222,9 @@ static enum pin_config_param pcs_bias[] = {
  */
 static struct lock_class_key pcs_lock_class;
 
+/* Class for the IRQ request mutex */
+static struct lock_class_key pcs_request_class;
+
 /*
  * REVISIT: Reads and writes could eventually use regmap or something
  * generic. But at least on omaps, some mux registers are performance
@@ -1486,7 +1489,7 @@ static int pcs_irqdomain_map(struct irq_domain *d, unsigned int irq,
 	irq_set_chip_data(irq, pcs_soc);
 	irq_set_chip_and_handler(irq, &pcs->chip,
 				 handle_level_irq);
-	irq_set_lockdep_class(irq, &pcs_lock_class);
+	irq_set_lockdep_class(irq, &pcs_lock_class, &pcs_request_class);
 	irq_set_noprobe(irq);
 
 	return 0;
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 55e672592fa9..7258cd676df4 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -66,9 +66,10 @@ struct gpio_irq_chip {
 	/**
 	 * @lock_key:
 	 *
-	 * Per GPIO IRQ chip lockdep class.
+	 * Per GPIO IRQ chip lockdep classes.
 	 */
 	struct lock_class_key *lock_key;
+	struct lock_class_key *request_key;
 
 	/**
 	 * @parent_handler:
@@ -323,7 +324,8 @@ extern const char *gpiochip_is_requested(struct gpio_chip *chip,
 
 /* add/remove chips */
 extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
-				      struct lock_class_key *lock_key);
+				      struct lock_class_key *lock_key,
+				      struct lock_class_key *request_key);
 
 /**
  * gpiochip_add_data() - register a gpio_chip
@@ -350,11 +352,13 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
  */
 #ifdef CONFIG_LOCKDEP
 #define gpiochip_add_data(chip, data) ({		\
-		static struct lock_class_key key;	\
-		gpiochip_add_data_with_key(chip, data, &key);	\
+		static struct lock_class_key lock_key;	\
+		static struct lock_class_key request_key;	  \
+		gpiochip_add_data_with_key(chip, data, &lock_key, \
+					   &request_key);	  \
 	})
 #else
-#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL)
+#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL, NULL)
 #endif
 
 static inline int gpiochip_add(struct gpio_chip *chip)
@@ -429,7 +433,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip,
 			     irq_flow_handler_t handler,
 			     unsigned int type,
 			     bool threaded,
-			     struct lock_class_key *lock_key);
+			     struct lock_class_key *lock_key,
+			     struct lock_class_key *request_key);
 
 #ifdef CONFIG_LOCKDEP
 
@@ -445,10 +450,12 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
 				       irq_flow_handler_t handler,
 				       unsigned int type)
 {
-	static struct lock_class_key key;
+	static struct lock_class_key lock_key;
+	static struct lock_class_key request_key;
 
 	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
-					handler, type, false, &key);
+					handler, type, false,
+					&lock_key, &request_key);
 }
 
 static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
@@ -458,10 +465,12 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
 			  unsigned int type)
 {
 
-	static struct lock_class_key key;
+	static struct lock_class_key lock_key;
+	static struct lock_class_key request_key;
 
 	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
-					handler, type, true, &key);
+					handler, type, true,
+					&lock_key, &request_key);
 }
 #else
 static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
@@ -471,7 +480,7 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
 				       unsigned int type)
 {
 	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
-					handler, type, false, NULL);
+					handler, type, false, NULL, NULL);
 }
 
 static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
@@ -481,7 +490,7 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
 			  unsigned int type)
 {
 	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
-					handler, type, true, NULL);
+					handler, type, true, NULL, NULL);
 }
 #endif /* CONFIG_LOCKDEP */
 
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 39fb3700f7a9..25b33b664537 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -255,12 +255,15 @@ static inline bool irq_is_percpu_devid(unsigned int irq)
 }
 
 static inline void
-irq_set_lockdep_class(unsigned int irq, struct lock_class_key *class)
+irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
+		      struct lock_class_key *request_class)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (desc)
-		lockdep_set_class(&desc->lock, class);
+	if (desc) {
+		lockdep_set_class(&desc->lock, lock_class);
+		lockdep_set_class(&desc->request_mutex, request_class);
+	}
 }
 
 #ifdef CONFIG_IRQ_PREFLOW_FASTEOI
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c26c5bb6b491..508c03dfef25 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
 EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
 
 /*
- * Separate lockdep class for interrupt chip which can nest irq_desc
- * lock.
+ * Separate lockdep classes for interrupt chip which can nest irq_desc
+ * lock and request mutex.
  */
 static struct lock_class_key irq_nested_lock_class;
+static struct lock_class_key irq_nested_request_class;
 
 /*
  * irq_map_generic_chip - Map a generic chip for an irq domain
@@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
 	set_bit(idx, &gc->installed);
 
 	if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
-		irq_set_lockdep_class(virq, &irq_nested_lock_class);
+		irq_set_lockdep_class(virq, &irq_nested_lock_class,
+				      &irq_nested_request_class);
 
 	if (chip->irq_calc_mask)
 		chip->irq_calc_mask(data);
@@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
 			continue;
 
 		if (flags & IRQ_GC_INIT_NESTED_LOCK)
-			irq_set_lockdep_class(i, &irq_nested_lock_class);
+			irq_set_lockdep_class(i, &irq_nested_lock_class,
+					      &irq_nested_request_class);
 
 		if (!(flags & IRQ_GC_NO_MASK)) {
 			struct irq_data *d = irq_get_irq_data(i);

From 466a2b42d67644447a1765276259a3ea5531ddff Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Thu, 21 Dec 2017 02:22:45 +0100
Subject: [PATCH 154/305] cpufreq: schedutil: Use idle_calls counter of the
 remote CPU

Since the recent remote cpufreq callback work, it's possible that a cpufreq
update is triggered from a remote CPU. For single policies however, the current
code uses the local CPU when trying to determine if the remote sg_cpu entered
idle or is busy. This is incorrect. To remedy this, compare with the nohz tick
idle_calls counter of the remote CPU.

Fixes: 674e75411fc2 ("sched: cpufreq: Allow remote cpufreq callbacks")
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Cc: 4.14+ <stable@vger.kernel.org> # 4.14+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/tick.h             |  1 +
 kernel/sched/cpufreq_schedutil.c |  2 +-
 kernel/time/tick-sched.c         | 13 +++++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index f442d1a42025..7cc35921218e 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -119,6 +119,7 @@ extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
 extern unsigned long tick_nohz_get_idle_calls(void);
+extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 #else /* !CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2f52ec0f1539..d6717a3331a1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
 #ifdef CONFIG_NO_HZ_COMMON
 static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
 {
-	unsigned long idle_calls = tick_nohz_get_idle_calls();
+	unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
 	bool ret = idle_calls == sg_cpu->saved_idle_calls;
 
 	sg_cpu->saved_idle_calls = idle_calls;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99578f06c8d4..77555faf6fbc 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -985,6 +985,19 @@ ktime_t tick_nohz_get_sleep_length(void)
 	return ts->sleep_length;
 }
 
+/**
+ * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
+ * for a particular CPU.
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
+{
+	struct tick_sched *ts = tick_get_tick_sched(cpu);
+
+	return ts->idle_calls;
+}
+
 /**
  * tick_nohz_get_idle_calls - return the current idle calls counter value
  *

From 11bca0a83f83f6093d816295668e74ef24595944 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Sat, 2 Dec 2017 09:13:04 -0800
Subject: [PATCH 155/305] genirq: Guard handle_bad_irq log messages

An interrupt storm on a bad interrupt will cause the kernel
log to be clogged.

[   60.089234] ->handle_irq():  ffffffffbe2f803f,
[   60.090455] 0xffffffffbf2af380
[   60.090510] handle_bad_irq+0x0/0x2e5
[   60.090522] ->irq_data.chip(): ffffffffbf2af380,
[   60.090553]    IRQ_NOPROBE set
[   60.090584] ->handle_irq():  ffffffffbe2f803f,
[   60.090590] handle_bad_irq+0x0/0x2e5
[   60.090596] ->irq_data.chip(): ffffffffbf2af380,
[   60.090602] 0xffffffffbf2af380
[   60.090608] ->action():           (null)
[   60.090779] handle_bad_irq+0x0/0x2e5

This was seen when running an upstream kernel on Acer Chromebook R11.  The
system was unstable as result.

Guard the log message with __ratelimit() to reduce the impact.  This
won't prevent the interrupt storm from happening, but at least the system
remains stable.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Dmitry Torokhov <dtor@chromium.org>
Cc: Joe Perches <joe@perches.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=197953
Link: https://lkml.kernel.org/r/1512234784-21038-1-git-send-email-linux@roeck-us.net
---
 kernel/irq/debug.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 17f05ef8f575..e4d3819a91cc 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -12,6 +12,11 @@
 
 static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 {
+	static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
+
+	if (!__ratelimit(&ratelimit))
+		return;
+
 	printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
 		irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
 	printk("->handle_irq():  %p, ", desc->handle_irq);

From 4fcab6693445cfb84f2b65868c58043535090e52 Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Mon, 4 Dec 2017 12:03:12 +0800
Subject: [PATCH 156/305] x86/apic: Avoid wrong warning when parsing 'apic=' in
 X86-32 case

There are two consumers of apic=:
  apic_set_verbosity() for setting the APIC debug level;
  parse_apic() for registering APIC driver by hand.

X86-32 supports both of them, but sometimes, kernel issues a weird warning.
eg: when kernel was booted up with 'apic=bigsmp' in command line,
early_param would warn like that:

...
[    0.000000] APIC Verbosity level bigsmp not recognised use apic=verbose or apic=debug
[    0.000000] Malformed early option 'apic'
...

Wrap the warning code in an #ifdef CONFIG_X86_64 block to avoid this.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: peterz@infradead.org
Cc: rdunlap@infradead.org
Cc: corbet@lwn.net
Link: https://lkml.kernel.org/r/20171204040313.24824-1-douly.fnst@cn.fujitsu.com
---
 arch/x86/kernel/apic/apic.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6e272f3ea984..880441f24146 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2626,11 +2626,13 @@ static int __init apic_set_verbosity(char *arg)
 		apic_verbosity = APIC_DEBUG;
 	else if (strcmp("verbose", arg) == 0)
 		apic_verbosity = APIC_VERBOSE;
+#ifdef CONFIG_X86_64
 	else {
 		pr_warning("APIC Verbosity level %s not recognised"
 			" use apic=verbose or apic=debug\n", arg);
 		return -EINVAL;
 	}
+#endif
 
 	return 0;
 }

From 64e05d118e357bb52a084b609436acf292ce7944 Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Mon, 4 Dec 2017 12:03:13 +0800
Subject: [PATCH 157/305] x86/apic: Update the 'apic=' description of setting
 APIC driver

There are two consumers of apic=: the APIC debug level and the low
level generic architecture code, but Linux just documented the first
one.

Append the second description.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: peterz@infradead.org
Cc: rdunlap@infradead.org
Cc: corbet@lwn.net
Link: https://lkml.kernel.org/r/20171204040313.24824-2-douly.fnst@cn.fujitsu.com
---
 Documentation/admin-guide/kernel-parameters.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b74e13312fdc..852fb11dd2c9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -328,11 +328,15 @@
 			not play well with APC CPU idle - disable it if you have
 			APC and your system crashes randomly.
 
-	apic=		[APIC,X86-32] Advanced Programmable Interrupt Controller
+	apic=		[APIC,X86] Advanced Programmable Interrupt Controller
 			Change the output verbosity whilst booting
 			Format: { quiet (default) | verbose | debug }
 			Change the amount of debugging information output
 			when initialising the APIC and IO-APIC components.
+			For X86-32, this can also be used to specify an APIC
+			driver name.
+			Format: apic=driver_name
+			Examples: apic=bigsmp
 
 	apic_extnmi=	[APIC,X86] External NMI delivery setting
 			Format: { bsp (default) | all | none }

From e7e83dd3ff1dd2f9e60213f6eedc7e5b08192062 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Tue, 26 Dec 2017 15:27:20 -0600
Subject: [PATCH 158/305] objtool: Fix Clang enum conversion warning

Fix the following Clang enum conversion warning:

  arch/x86/decode.c:141:20: error: implicit conversion from enumeration
  type 'enum op_src_type' to different enumeration
  type 'enum op_dest_type' [-Werror,-Wenum-conversion]

    op->dest.type = OP_SRC_REG;
		  ~ ^~~~~~~~~~

It just happened to work before because OP_SRC_REG and OP_DEST_REG have
the same value.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Nicholas Mc Guire <der.herr@hofr.at>
Reviewed-by: Nick Desaulniers <nick.desaulniers@gmail.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: baa41469a7b9 ("objtool: Implement stack validation 2.0")
Link: http://lkml.kernel.org/r/b4156c5738bae781c392e7a3691aed4514ebbdf2.1514323568.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/arch/x86/decode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 8acfc47af70e..540a209b78ab 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -138,7 +138,7 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
 			*type = INSN_STACK;
 			op->src.type = OP_SRC_ADD;
 			op->src.reg = op_to_cfi_reg[modrm_reg][rex_r];
-			op->dest.type = OP_SRC_REG;
+			op->dest.type = OP_DEST_REG;
 			op->dest.reg = CFI_SP;
 		}
 		break;

From a31e58e129f73ab5b04016330b13ed51fde7a961 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 28 Dec 2017 11:33:33 +0100
Subject: [PATCH 159/305] x86/apic: Switch all APICs to Fixed delivery mode

Some of the APIC incarnations are operating in lowest priority delivery
mode. This worked as long as the vector management code allocated the same
vector on all possible CPUs for each interrupt.

Lowest priority delivery mode does not necessarily respect the affinity
setting and may redirect to some other online CPU. This was documented
somewhere in the old code and the conversion to single target delivery
failed to update the delivery mode of the affected APIC drivers which
results in spurious interrupts on some of the affected CPU/Chipset
combinations.

Switch the APIC drivers over to Fixed delivery mode and remove all
leftovers of lowest priority delivery mode.

Switching to Fixed delivery mode is not a problem on these CPUs because the
kernel already uses Fixed delivery mode for IPIs. The reason for this is
that the SDM explicitly forbids lowest prio mode for IPIs. The reason is
obvious: If the irq routing does not honor destination targets in lowest
prio mode then an IPI targeted at CPU1 might end up on CPU0, which would be
a fatal problem in many cases.

As a consequence of this change, the apic::irq_delivery_mode field is now
pointless, but this needs to be cleaned up in a separate patch.

Fixes: fdba46ffb4c2 ("x86/apic: Get rid of multi CPU affinity")
Reported-by: vcaputo@pengaru.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: vcaputo@pengaru.com
Cc: Pavel Machek <pavel@ucw.cz>
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712281140440.1688@nanos
---
 arch/x86/kernel/apic/apic_flat_64.c   | 2 +-
 arch/x86/kernel/apic/apic_noop.c      | 2 +-
 arch/x86/kernel/apic/msi.c            | 8 ++------
 arch/x86/kernel/apic/probe_32.c       | 2 +-
 arch/x86/kernel/apic/x2apic_cluster.c | 2 +-
 drivers/pci/host/pci-hyperv.c         | 8 ++------
 6 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index aa85690e9b64..25a87028cb3f 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = {
 	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= flat_apic_id_registered,
 
-	.irq_delivery_mode		= dest_LowestPrio,
+	.irq_delivery_mode		= dest_Fixed,
 	.irq_dest_mode			= 1, /* logical */
 
 	.disable_esr			= 0,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 7b659c4480c9..5078b5ce63a7 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = {
 	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= noop_apic_id_registered,
 
-	.irq_delivery_mode		= dest_LowestPrio,
+	.irq_delivery_mode		= dest_Fixed,
 	/* logical delivery broadcast to all CPUs: */
 	.irq_dest_mode			= 1,
 
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 9b18be764422..ce503c99f5c4 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
 		((apic->irq_dest_mode == 0) ?
 			MSI_ADDR_DEST_MODE_PHYSICAL :
 			MSI_ADDR_DEST_MODE_LOGICAL) |
-		((apic->irq_delivery_mode != dest_LowestPrio) ?
-			MSI_ADDR_REDIRECTION_CPU :
-			MSI_ADDR_REDIRECTION_LOWPRI) |
+		MSI_ADDR_REDIRECTION_CPU |
 		MSI_ADDR_DEST_ID(cfg->dest_apicid);
 
 	msg->data =
 		MSI_DATA_TRIGGER_EDGE |
 		MSI_DATA_LEVEL_ASSERT |
-		((apic->irq_delivery_mode != dest_LowestPrio) ?
-			MSI_DATA_DELIVERY_FIXED :
-			MSI_DATA_DELIVERY_LOWPRI) |
+		MSI_DATA_DELIVERY_FIXED |
 		MSI_DATA_VECTOR(cfg->vector);
 }
 
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fa22017de806..02e8acb134f8 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = {
 	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= default_apic_id_registered,
 
-	.irq_delivery_mode		= dest_LowestPrio,
+	.irq_delivery_mode		= dest_Fixed,
 	/* logical delivery broadcast to all CPUs: */
 	.irq_dest_mode			= 1,
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 622f13ca8a94..8b04234e010b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
 	.apic_id_valid			= x2apic_apic_id_valid,
 	.apic_id_registered		= x2apic_apic_id_registered,
 
-	.irq_delivery_mode		= dest_LowestPrio,
+	.irq_delivery_mode		= dest_Fixed,
 	.irq_dest_mode			= 1, /* logical */
 
 	.disable_esr			= 0,
diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c
index 0fe3ea164ee5..e7d94473aedd 100644
--- a/drivers/pci/host/pci-hyperv.c
+++ b/drivers/pci/host/pci-hyperv.c
@@ -985,9 +985,7 @@ static u32 hv_compose_msi_req_v1(
 	int_pkt->wslot.slot = slot;
 	int_pkt->int_desc.vector = vector;
 	int_pkt->int_desc.vector_count = 1;
-	int_pkt->int_desc.delivery_mode =
-		(apic->irq_delivery_mode == dest_LowestPrio) ?
-			dest_LowestPrio : dest_Fixed;
+	int_pkt->int_desc.delivery_mode = dest_Fixed;
 
 	/*
 	 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
@@ -1008,9 +1006,7 @@ static u32 hv_compose_msi_req_v2(
 	int_pkt->wslot.slot = slot;
 	int_pkt->int_desc.vector = vector;
 	int_pkt->int_desc.vector_count = 1;
-	int_pkt->int_desc.delivery_mode =
-		(apic->irq_delivery_mode == dest_LowestPrio) ?
-			dest_LowestPrio : dest_Fixed;
+	int_pkt->int_desc.delivery_mode = dest_Fixed;
 
 	/*
 	 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten

From 8880c13734af33635118a1e9567dadc7f9ddb7a8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 16:29:15 +0100
Subject: [PATCH 160/305] gpio: brcmstb: Make really use of the new lockdep
 class

The recent extension of irq_set_lockdep_class() with a second argument
added the new lockdep class to the brcmstb driver, but used the already
existing lockdep class as second argument, which leaves the new lockdep
class defined but unused.

Use the new lockdep class as that's what the change intended to do.

Fixes: 39c3fd58952d ("kernel/irq: Extend lockdep class for request mutex")
Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: linus.walleij@linaro.org
---
 drivers/gpio/gpio-brcmstb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c
index 5b24801bffef..bb4f8cf18bd9 100644
--- a/drivers/gpio/gpio-brcmstb.c
+++ b/drivers/gpio/gpio-brcmstb.c
@@ -348,7 +348,7 @@ static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq,
 	if (ret < 0)
 		return ret;
 	irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class,
-			      &brcmstb_gpio_irq_lock_class);
+			      &brcmstb_gpio_irq_request_class);
 	irq_set_chip_and_handler(irq, &priv->irq_chip, handle_level_irq);
 	irq_set_noprobe(irq);
 	return 0;

From da5dd9e854d2edd6b02ebfe28583052f922104da Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 10:42:10 +0100
Subject: [PATCH 161/305] genirq/msi: Handle reactivation only on success

When analyzing the fallout of the x86 vector allocation rework it turned
out that the error handling in msi_domain_alloc_irqs() is broken.

If MSI_FLAG_MUST_REACTIVATE is set for a MSI domain then it clears the
activation flag for a successfully initialized msi descriptor. If a
subsequent initialization fails then the error handling code path does not
deactivate the interrupt because the activation flag got cleared.

Move the clearing of the activation flag outside of the initialization loop
so that an eventual failure can be cleaned up correctly.

Fixes: 22d0b12f3560 ("genirq/irqdomain: Add force reactivation flag to irq domains")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Mikael Pettersson <mikpelinux@gmail.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Sakari Ailus <sakari.ailus@intel.com>
Cc: linux-media@vger.kernel.org
---
 kernel/irq/msi.c | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index edb987b2c58d..9ba954331171 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,6 +339,13 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
 	return ret;
 }
 
+static bool msi_check_reservation_mode(struct msi_domain_info *info)
+{
+	if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
+		return false;
+	return true;
+}
+
 /**
  * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
  * @domain:	The domain to allocate from
@@ -353,9 +360,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 {
 	struct msi_domain_info *info = domain->host_data;
 	struct msi_domain_ops *ops = info->ops;
-	msi_alloc_info_t arg;
+	struct irq_data *irq_data;
 	struct msi_desc *desc;
+	msi_alloc_info_t arg;
 	int i, ret, virq;
+	bool can_reserve;
 
 	ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
 	if (ret)
@@ -385,6 +394,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 	if (ops->msi_finish)
 		ops->msi_finish(&arg, 0);
 
+	can_reserve = msi_check_reservation_mode(info);
+
 	for_each_msi_entry(desc, dev) {
 		virq = desc->irq;
 		if (desc->nvec_used == 1)
@@ -397,15 +408,23 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 		 * the MSI entries before the PCI layer enables MSI in the
 		 * card. Otherwise the card latches a random msi message.
 		 */
-		if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
-			struct irq_data *irq_data;
+		if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
+			continue;
 
+		irq_data = irq_domain_get_irq_data(domain, desc->irq);
+		ret = irq_domain_activate_irq(irq_data, true);
+		if (ret)
+			goto cleanup;
+	}
+
+	/*
+	 * If these interrupts use reservation mode, clear the activated bit
+	 * so request_irq() will assign the final vector.
+	 */
+	if (can_reserve) {
+		for_each_msi_entry(desc, dev) {
 			irq_data = irq_domain_get_irq_data(domain, desc->irq);
-			ret = irq_domain_activate_irq(irq_data, true);
-			if (ret)
-				goto cleanup;
-			if (info->flags & MSI_FLAG_MUST_REACTIVATE)
-				irqd_clr_activated(irq_data);
+			irqd_clr_activated(irq_data);
 		}
 	}
 	return 0;

From 69790ba92b8d67eaee5e50b30a5b696d40664caf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 16:44:34 +0100
Subject: [PATCH 162/305] genirq: Introduce IRQD_CAN_RESERVE flag

Add a new flag to mark interrupts which can use reservation mode. This is
going to be used in subsequent patches to disable reservation mode for a
certain class of MSI devices.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Mikael Pettersson <mikpelinux@gmail.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Sakari Ailus <sakari.ailus@intel.com>,
Cc: linux-media@vger.kernel.org
---
 include/linux/irq.h  | 17 +++++++++++++++++
 kernel/irq/debugfs.c |  1 +
 2 files changed, 18 insertions(+)

diff --git a/include/linux/irq.h b/include/linux/irq.h
index e140f69163b6..a0231e96a578 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -212,6 +212,7 @@ struct irq_data {
  *				  mask. Applies only to affinity managed irqs.
  * IRQD_SINGLE_TARGET		- IRQ allows only a single affinity target
  * IRQD_DEFAULT_TRIGGER_SET	- Expected trigger already been set
+ * IRQD_CAN_RESERVE		- Can use reservation mode
  */
 enum {
 	IRQD_TRIGGER_MASK		= 0xf,
@@ -233,6 +234,7 @@ enum {
 	IRQD_MANAGED_SHUTDOWN		= (1 << 23),
 	IRQD_SINGLE_TARGET		= (1 << 24),
 	IRQD_DEFAULT_TRIGGER_SET	= (1 << 25),
+	IRQD_CAN_RESERVE		= (1 << 26),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -377,6 +379,21 @@ static inline bool irqd_is_managed_and_shutdown(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN;
 }
 
+static inline void irqd_set_can_reserve(struct irq_data *d)
+{
+	__irqd_to_state(d) |= IRQD_CAN_RESERVE;
+}
+
+static inline void irqd_clr_can_reserve(struct irq_data *d)
+{
+	__irqd_to_state(d) &= ~IRQD_CAN_RESERVE;
+}
+
+static inline bool irqd_can_reserve(struct irq_data *d)
+{
+	return __irqd_to_state(d) & IRQD_CAN_RESERVE;
+}
+
 #undef __irqd_to_state
 
 static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 7f608ac39653..acfaaef8672a 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = {
 	BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
 	BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
 	BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
+	BIT_MASK_DESCR(IRQD_CAN_RESERVE),
 
 	BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
 

From 945f50a591783ac6e9bd59694f34d1ba03b778a7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 16:57:00 +0100
Subject: [PATCH 163/305] x86/vector: Use IRQD_CAN_RESERVE flag

Set the new CAN_RESERVE flag when the initial reservation for an interrupt
happens. The flag is used in a subsequent patch to disable reservation mode
for a certain class of MSI devices.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Mikael Pettersson <mikpelinux@gmail.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Sakari Ailus <sakari.ailus@intel.com>,
Cc: linux-media@vger.kernel.org
---
 arch/x86/kernel/apic/vector.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 750449152b04..1e969dba0476 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(struct irq_data *irqd)
 	irq_matrix_reserve(vector_matrix);
 	apicd->can_reserve = true;
 	apicd->has_reserved = true;
+	irqd_set_can_reserve(irqd);
 	trace_vector_reserve(irqd->irq, 0);
 	vector_assign_managed_shutdown(irqd);
 }
@@ -478,6 +479,7 @@ static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
 	} else {
 		/* Release the vector */
 		apicd->can_reserve = true;
+		irqd_set_can_reserve(irqd);
 		clear_irq_vector(irqd);
 		realloc = true;
 	}

From 702cb0a02813299d6911b775c637906ae21b737d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 16:59:06 +0100
Subject: [PATCH 164/305] genirq/irqdomain: Rename early argument of
 irq_domain_activate_irq()

The 'early' argument of irq_domain_activate_irq() is actually used to
denote reservation mode. To avoid confusion, rename it before abuse
happens.

No functional change.

Fixes: 72491643469a ("genirq/irqdomain: Update irq_domain_ops.activate() signature")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexandru Chirvasitu <achirvasub@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Mikael Pettersson <mikpelinux@gmail.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Sakari Ailus <sakari.ailus@intel.com>,
Cc: linux-media@vger.kernel.org
---
 arch/x86/include/asm/irqdomain.h         |  2 +-
 arch/x86/include/asm/trace/irq_vectors.h | 16 ++++++++--------
 arch/x86/kernel/apic/io_apic.c           |  2 +-
 arch/x86/kernel/apic/vector.c            |  6 +++---
 arch/x86/platform/uv/uv_irq.c            |  2 +-
 drivers/gpio/gpio-xgene-sb.c             |  2 +-
 drivers/iommu/amd_iommu.c                |  2 +-
 drivers/iommu/intel_irq_remapping.c      |  2 +-
 drivers/irqchip/irq-gic-v3-its.c         |  4 ++--
 drivers/pinctrl/stm32/pinctrl-stm32.c    |  2 +-
 include/linux/irqdomain.h                |  2 +-
 kernel/irq/internals.h                   |  2 +-
 kernel/irq/irqdomain.c                   | 13 +++++++------
 13 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
index 139feef467f7..c066ffae222b 100644
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -44,7 +44,7 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
 extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
 			      unsigned int nr_irqs);
 extern int mp_irqdomain_activate(struct irq_domain *domain,
-				 struct irq_data *irq_data, bool early);
+				 struct irq_data *irq_data, bool reserve);
 extern void mp_irqdomain_deactivate(struct irq_domain *domain,
 				    struct irq_data *irq_data);
 extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 84b9ec0c1bc0..22647a642e98 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -283,34 +283,34 @@ TRACE_EVENT(vector_alloc_managed,
 DECLARE_EVENT_CLASS(vector_activate,
 
 	TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve,
-		 bool early),
+		 bool reserve),
 
-	TP_ARGS(irq, is_managed, can_reserve, early),
+	TP_ARGS(irq, is_managed, can_reserve, reserve),
 
 	TP_STRUCT__entry(
 		__field(	unsigned int,	irq		)
 		__field(	bool,		is_managed	)
 		__field(	bool,		can_reserve	)
-		__field(	bool,		early		)
+		__field(	bool,		reserve		)
 	),
 
 	TP_fast_assign(
 		__entry->irq		= irq;
 		__entry->is_managed	= is_managed;
 		__entry->can_reserve	= can_reserve;
-		__entry->early		= early;
+		__entry->reserve	= reserve;
 	),
 
-	TP_printk("irq=%u is_managed=%d can_reserve=%d early=%d",
+	TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d",
 		  __entry->irq, __entry->is_managed, __entry->can_reserve,
-		  __entry->early)
+		  __entry->reserve)
 );
 
 #define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name)				\
 DEFINE_EVENT_FN(vector_activate, name,					\
 	TP_PROTO(unsigned int irq, bool is_managed,			\
-		 bool can_reserve, bool early),				\
-	TP_ARGS(irq, is_managed, can_reserve, early), NULL, NULL);	\
+		 bool can_reserve, bool reserve),			\
+	TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL);	\
 
 DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate);
 DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 201579dc5242..8a7963421460 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
 }
 
 int mp_irqdomain_activate(struct irq_domain *domain,
-			  struct irq_data *irq_data, bool early)
+			  struct irq_data *irq_data, bool reserve)
 {
 	unsigned long flags;
 
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 1e969dba0476..52c85c8147e9 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -399,21 +399,21 @@ static int activate_managed(struct irq_data *irqd)
 }
 
 static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
-			       bool early)
+			       bool reserve)
 {
 	struct apic_chip_data *apicd = apic_chip_data(irqd);
 	unsigned long flags;
 	int ret = 0;
 
 	trace_vector_activate(irqd->irq, apicd->is_managed,
-			      apicd->can_reserve, early);
+			      apicd->can_reserve, reserve);
 
 	/* Nothing to do for fixed assigned vectors */
 	if (!apicd->can_reserve && !apicd->is_managed)
 		return 0;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	if (early || irqd_is_managed_and_shutdown(irqd))
+	if (reserve || irqd_is_managed_and_shutdown(irqd))
 		vector_assign_managed_shutdown(irqd);
 	else if (apicd->is_managed)
 		ret = activate_managed(irqd);
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 5f6fd860820a..e4cb9f4cde8a 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -128,7 +128,7 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq,
  * on the specified blade to allow the sending of MSIs to the specified CPU.
  */
 static int uv_domain_activate(struct irq_domain *domain,
-			      struct irq_data *irq_data, bool early)
+			      struct irq_data *irq_data, bool reserve)
 {
 	uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
 	return 0;
diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c
index 2313af82fad3..acd59113e08b 100644
--- a/drivers/gpio/gpio-xgene-sb.c
+++ b/drivers/gpio/gpio-xgene-sb.c
@@ -139,7 +139,7 @@ static int xgene_gpio_sb_to_irq(struct gpio_chip *gc, u32 gpio)
 
 static int xgene_gpio_sb_domain_activate(struct irq_domain *d,
 					 struct irq_data *irq_data,
-					 bool early)
+					 bool reserve)
 {
 	struct xgene_gpio_sb *priv = d->host_data;
 	u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq);
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 7d5eb004091d..97baf88d9505 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -4184,7 +4184,7 @@ static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
 			       struct irq_cfg *cfg);
 
 static int irq_remapping_activate(struct irq_domain *domain,
-				  struct irq_data *irq_data, bool early)
+				  struct irq_data *irq_data, bool reserve)
 {
 	struct amd_ir_data *data = irq_data->chip_data;
 	struct irq_2_irte *irte_info = &data->irq_2_irte;
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index 76a193c7fcfc..66f69af2c219 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -1397,7 +1397,7 @@ static void intel_irq_remapping_free(struct irq_domain *domain,
 }
 
 static int intel_irq_remapping_activate(struct irq_domain *domain,
-					struct irq_data *irq_data, bool early)
+					struct irq_data *irq_data, bool reserve)
 {
 	intel_ir_reconfigure_irte(irq_data, true);
 	return 0;
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 4039e64cd342..06f025fd5726 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2303,7 +2303,7 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 }
 
 static int its_irq_domain_activate(struct irq_domain *domain,
-				   struct irq_data *d, bool early)
+				   struct irq_data *d, bool reserve)
 {
 	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
 	u32 event = its_get_event_id(d);
@@ -2818,7 +2818,7 @@ static int its_vpe_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
 }
 
 static int its_vpe_irq_domain_activate(struct irq_domain *domain,
-				       struct irq_data *d, bool early)
+				       struct irq_data *d, bool reserve)
 {
 	struct its_vpe *vpe = irq_data_get_irq_chip_data(d);
 	struct its_node *its;
diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c
index a276c61be217..e62ab087bfd8 100644
--- a/drivers/pinctrl/stm32/pinctrl-stm32.c
+++ b/drivers/pinctrl/stm32/pinctrl-stm32.c
@@ -290,7 +290,7 @@ static int stm32_gpio_domain_translate(struct irq_domain *d,
 }
 
 static int stm32_gpio_domain_activate(struct irq_domain *d,
-				      struct irq_data *irq_data, bool early)
+				      struct irq_data *irq_data, bool reserve)
 {
 	struct stm32_gpio_bank *bank = d->host_data;
 	struct stm32_pinctrl *pctl = dev_get_drvdata(bank->gpio_chip.parent);
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index a34355d19546..48c7e86bb556 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -113,7 +113,7 @@ struct irq_domain_ops {
 		     unsigned int nr_irqs, void *arg);
 	void (*free)(struct irq_domain *d, unsigned int virq,
 		     unsigned int nr_irqs);
-	int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool early);
+	int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool reserve);
 	void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data);
 	int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec,
 			 unsigned long *out_hwirq, unsigned int *out_type);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 07d08ca701ec..ab19371eab9b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
 #endif /* !CONFIG_GENERIC_PENDING_IRQ */
 
 #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
-static inline int irq_domain_activate_irq(struct irq_data *data, bool early)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve)
 {
 	irqd_set_activated(data);
 	return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4f4f60015e8a..62068ad46930 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
 	}
 }
 
-static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve)
 {
 	int ret = 0;
 
@@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
 
 		if (irqd->parent_data)
 			ret = __irq_domain_activate_irq(irqd->parent_data,
-							early);
+							reserve);
 		if (!ret && domain->ops->activate) {
-			ret = domain->ops->activate(domain, irqd, early);
+			ret = domain->ops->activate(domain, irqd, reserve);
 			/* Rollback in case of error */
 			if (ret && irqd->parent_data)
 				__irq_domain_deactivate_irq(irqd->parent_data);
@@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
 /**
  * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
  *			     interrupt
- * @irq_data:	outermost irq_data associated with interrupt
+ * @irq_data:	Outermost irq_data associated with interrupt
+ * @reserve:	If set only reserve an interrupt vector instead of assigning one
  *
  * This is the second step to call domain_ops->activate to program interrupt
  * controllers, so the interrupt could actually get delivered.
  */
-int irq_domain_activate_irq(struct irq_data *irq_data, bool early)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve)
 {
 	int ret = 0;
 
 	if (!irqd_is_activated(irq_data))
-		ret = __irq_domain_activate_irq(irq_data, early);
+		ret = __irq_domain_activate_irq(irq_data, reserve);
 	if (!ret)
 		irqd_set_activated(irq_data);
 	return ret;

From bc976233a872c0f20f018fb1e89264a541584e25 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 10:47:22 +0100
Subject: [PATCH 165/305] genirq/msi, x86/vector: Prevent reservation mode for
 non maskable MSI

The new reservation mode for interrupts assigns a dummy vector when the
interrupt is allocated and assigns a real vector when the interrupt is
requested. The reservation mode prevents vector pressure when devices with
a large amount of queues/interrupts are initialized, but only a minimal
subset of those queues/interrupts is actually used.

This mode has an issue with MSI interrupts which cannot be masked. If the
driver is not careful or the hardware emits an interrupt before the device
irq is requested by the driver then the interrupt ends up on the dummy
vector as a spurious interrupt which can cause malfunction of the device or
in the worst case a lockup of the machine.

Change the logic for the reservation mode so that the early activation of
MSI interrupts checks whether:

 - the device is a PCI/MSI device
 - the reservation mode of the underlying irqdomain is activated
 - PCI/MSI masking is globally enabled
 - the PCI/MSI device uses either MSI-X, which supports masking, or
   MSI with the maskbit supported.

If one of those conditions is false, then clear the reservation mode flag
in the irq data of the interrupt and invoke irq_domain_activate_irq() with
the reserve argument cleared. In the x86 vector code, clear the can_reserve
flag in the vector allocation data so a subsequent free_irq() won't create
the same situation again. The interrupt stays assigned to a real vector
until pci_disable_msi() is invoked and all allocations are undone.

Fixes: 4900be83602b ("x86/vector/msi: Switch to global reservation mode")
Reported-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Reported-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Mikael Pettersson <mikpelinux@gmail.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Sakari Ailus <sakari.ailus@intel.com>,
Cc: linux-media@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291406420.1899@nanos
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291409460.1899@nanos
---
 arch/x86/kernel/apic/vector.c | 12 +++++++++++-
 kernel/irq/msi.c              | 37 +++++++++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 52c85c8147e9..f8b03bb8e725 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -369,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd)
 	int ret;
 
 	ret = assign_irq_vector_any_locked(irqd);
-	if (!ret)
+	if (!ret) {
 		apicd->has_reserved = false;
+		/*
+		 * Core might have disabled reservation mode after
+		 * allocating the irq descriptor. Ideally this should
+		 * happen before allocation time, but that would require
+		 * completely convoluted ways of transporting that
+		 * information.
+		 */
+		if (!irqd_can_reserve(irqd))
+			apicd->can_reserve = false;
+	}
 	return ret;
 }
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 9ba954331171..2f3c4f5382cc 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,11 +339,38 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
 	return ret;
 }
 
-static bool msi_check_reservation_mode(struct msi_domain_info *info)
+/*
+ * Carefully check whether the device can use reservation mode. If
+ * reservation mode is enabled then the early activation will assign a
+ * dummy vector to the device. If the PCI/MSI device does not support
+ * masking of the entry then this can result in spurious interrupts when
+ * the device driver is not absolutely careful. But even then a malfunction
+ * of the hardware could result in a spurious interrupt on the dummy vector
+ * and render the device unusable. If the entry can be masked then the core
+ * logic will prevent the spurious interrupt and reservation mode can be
+ * used. For now reservation mode is restricted to PCI/MSI.
+ */
+static bool msi_check_reservation_mode(struct irq_domain *domain,
+				       struct msi_domain_info *info,
+				       struct device *dev)
 {
+	struct msi_desc *desc;
+
+	if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
+		return false;
+
 	if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
 		return false;
-	return true;
+
+	if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+		return false;
+
+	/*
+	 * Checking the first MSI descriptor is sufficient. MSIX supports
+	 * masking and MSI does so when the maskbit is set.
+	 */
+	desc = first_msi_entry(dev);
+	return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
 }
 
 /**
@@ -394,7 +421,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 	if (ops->msi_finish)
 		ops->msi_finish(&arg, 0);
 
-	can_reserve = msi_check_reservation_mode(info);
+	can_reserve = msi_check_reservation_mode(domain, info, dev);
 
 	for_each_msi_entry(desc, dev) {
 		virq = desc->irq;
@@ -412,7 +439,9 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 			continue;
 
 		irq_data = irq_domain_get_irq_data(domain, desc->irq);
-		ret = irq_domain_activate_irq(irq_data, true);
+		if (!can_reserve)
+			irqd_clr_can_reserve(irq_data);
+		ret = irq_domain_activate_irq(irq_data, can_reserve);
 		if (ret)
 			goto cleanup;
 	}

From ced6d5c11d3e7b342f1a80f908e6756ebd4b8ddd Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Fri, 22 Dec 2017 15:51:12 +0100
Subject: [PATCH 166/305] timers: Use deferrable base independent of
 base::nohz_active

During boot and before base::nohz_active is set in the timer bases, deferrable
timers are enqueued into the standard timer base. This works correctly as
long as base::nohz_active is false.

Once base::nohz_active is set and a timer which was enqueued before that
is accessed the lock selector code chooses the lock of the deferred
base. This causes unlocked access to the standard base and in case the
timer is removed it does not clear the pending flag in the standard base
bitmap which causes get_next_timer_interrupt() to return bogus values.

To prevent that, the deferrable timers must be enqueued in the deferrable
base, even when base::nohz_active is not set. Those deferrable timers also
need to be expired unconditionally.

Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel")
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Cc: rt@linutronix.de
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Link: https://lkml.kernel.org/r/20171222145337.633328378@linutronix.de
---
 kernel/time/timer.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ffebcf878fba..19a9c3da7698 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -823,11 +823,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
 	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
 
 	/*
-	 * If the timer is deferrable and nohz is active then we need to use
-	 * the deferrable base.
+	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
+	 * to use the deferrable base.
 	 */
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
-	    (tflags & TIMER_DEFERRABLE))
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
 		base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
 	return base;
 }
@@ -837,11 +836,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
 	/*
-	 * If the timer is deferrable and nohz is active then we need to use
-	 * the deferrable base.
+	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
+	 * to use the deferrable base.
 	 */
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
-	    (tflags & TIMER_DEFERRABLE))
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
 		base = this_cpu_ptr(&timer_bases[BASE_DEF]);
 	return base;
 }
@@ -1684,7 +1682,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 	base->must_forward_clk = false;
 
 	__run_timers(base);
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
 }
 

From 26456f87aca7157c057de65c9414b37f1ab881d1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 27 Dec 2017 21:37:25 +0100
Subject: [PATCH 167/305] timers: Reinitialize per cpu bases on hotplug

The timer wheel bases are not (re)initialized on CPU hotplug. That leaves
them with a potentially stale clk and next_expiry value, which can cause
trouble when the CPU is plugged.

Add a prepare callback which forwards the clock, sets next_expiry to far in
the future and resets the control flags to a known state.

Set base->must_forward_clk so the first timer which is queued will try to
forward the clock to current jiffies.

Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel")
Reported-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272152200.2431@nanos
---
 include/linux/cpuhotplug.h |  2 +-
 include/linux/timer.h      |  4 +++-
 kernel/cpu.c               |  4 ++--
 kernel/time/timer.c        | 15 +++++++++++++++
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 201ab7267986..1a32e558eb11 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -86,7 +86,7 @@ enum cpuhp_state {
 	CPUHP_MM_ZSWP_POOL_PREPARE,
 	CPUHP_KVM_PPC_BOOK3S_PREPARE,
 	CPUHP_ZCOMP_PREPARE,
-	CPUHP_TIMERS_DEAD,
+	CPUHP_TIMERS_PREPARE,
 	CPUHP_MIPS_SOC_PREPARE,
 	CPUHP_BP_PREPARE_DYN,
 	CPUHP_BP_PREPARE_DYN_END		= CPUHP_BP_PREPARE_DYN + 20,
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 04af640ea95b..2448f9cc48a3 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -207,9 +207,11 @@ unsigned long round_jiffies_up(unsigned long j);
 unsigned long round_jiffies_up_relative(unsigned long j);
 
 #ifdef CONFIG_HOTPLUG_CPU
+int timers_prepare_cpu(unsigned int cpu);
 int timers_dead_cpu(unsigned int cpu);
 #else
-#define timers_dead_cpu NULL
+#define timers_prepare_cpu	NULL
+#define timers_dead_cpu		NULL
 #endif
 
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 41376c3ac93b..97858477e586 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 	 * before blk_mq_queue_reinit_notify() from notify_dead(),
 	 * otherwise a RCU stall occurs.
 	 */
-	[CPUHP_TIMERS_DEAD] = {
+	[CPUHP_TIMERS_PREPARE] = {
 		.name			= "timers:dead",
-		.startup.single		= NULL,
+		.startup.single		= timers_prepare_cpu,
 		.teardown.single	= timers_dead_cpu,
 	},
 	/* Kicks the plugged cpu into life */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 19a9c3da7698..6be576e02209 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1853,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
 	}
 }
 
+int timers_prepare_cpu(unsigned int cpu)
+{
+	struct timer_base *base;
+	int b;
+
+	for (b = 0; b < NR_BASES; b++) {
+		base = per_cpu_ptr(&timer_bases[b], cpu);
+		base->clk = jiffies;
+		base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+		base->is_idle = false;
+		base->must_forward_clk = true;
+	}
+	return 0;
+}
+
 int timers_dead_cpu(unsigned int cpu)
 {
 	struct timer_base *old_base;

From 5d62c183f9e9df1deeea0906d099a94e8a43047a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 22 Dec 2017 15:51:13 +0100
Subject: [PATCH 168/305] nohz: Prevent a timer interrupt storm in
 tick_nohz_stop_sched_tick()

The conditions in irq_exit() to invoke tick_nohz_irq_exit() which
subsequently invokes tick_nohz_stop_sched_tick() are:

  if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu))

If need_resched() is not set, but a timer softirq is pending then this is
an indication that the softirq code punted and delegated the execution to
ksoftirqd. need_resched() is not true because the current interrupted task
takes precedence over ksoftirqd.

Invoking tick_nohz_irq_exit() in this case can cause an endless loop of
timer interrupts because the timer wheel contains an expired timer, but
softirqs are not yet executed. So it returns an immediate expiry request,
which causes the timer to fire immediately again. Lather, rinse and
repeat....

Prevent that by adding a check for a pending timer soft interrupt to the
conditions in tick_nohz_stop_sched_tick() which avoid calling
get_next_timer_interrupt(). That keeps the tick sched timer on the tick and
prevents a repetitive programming of an already expired timer.

Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272156050.2431@nanos
---
 kernel/time/tick-sched.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 77555faf6fbc..f7cc7abfcf25 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 	ts->next_tick = 0;
 }
 
+static inline bool local_timer_softirq_pending(void)
+{
+	return local_softirq_pending() & BIT(TIMER_SOFTIRQ); /* nr, not mask */
+}
+
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 					 ktime_t now, int cpu)
 {
@@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	} while (read_seqretry(&jiffies_lock, seq));
 	ts->last_jiffies = basejiff;
 
-	if (rcu_needs_cpu(basemono, &next_rcu) ||
-	    arch_needs_cpu() || irq_work_needs_cpu()) {
+	/*
+	 * Keep the periodic tick, when RCU, architecture or irq_work
+	 * requests it.
+	 * Aside of that check whether the local timer softirq is
+	 * pending. If so it's a bad idea to call get_next_timer_interrupt()
+	 * because there is an already expired timer, so it will request
+	 * immediate expiry, which rearms the hardware timer with a
+	 * minimal delta which brings us back to this place
+	 * immediately. Lather, rinse and repeat...
+	 */
+	if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+	    irq_work_needs_cpu() || local_timer_softirq_pending()) {
 		next_tick = basemono + TICK_NSEC;
 	} else {
 		/*

From fd45bb77ad682be728d1002431d77b8c73342836 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 22 Dec 2017 15:51:14 +0100
Subject: [PATCH 169/305] timers: Invoke timer_start_debug() where it makes
 sense

The timer start debug function is called before the proper timer base is
set. As a consequence the trace data contains the stale CPU and flags
values.

Call the debug function after setting the new base and flags.

Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Cc: rt@linutronix.de
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Link: https://lkml.kernel.org/r/20171222145337.792907137@linutronix.de
---
 kernel/time/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 6be576e02209..89a9e1b4264a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1007,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
 	if (!ret && (options & MOD_TIMER_PENDING_ONLY))
 		goto out_unlock;
 
-	debug_activate(timer, expires);
-
 	new_base = get_target_base(base, timer->flags);
 
 	if (base != new_base) {
@@ -1032,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
 		}
 	}
 
+	debug_activate(timer, expires);
+
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance

From 9f4533cd7334235cd4c9b9fb1b0b8791e2ba01a7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 22 Dec 2017 15:51:15 +0100
Subject: [PATCH 170/305] timerqueue: Document return values of
 timerqueue_add/del()

The return values of timerqueue_add/del() are not documented in the kernel doc
comment. Add proper documentation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: rt@linutronix.de
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Link: https://lkml.kernel.org/r/20171222145337.872681338@linutronix.de
---
 lib/timerqueue.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/timerqueue.c b/lib/timerqueue.c
index 4a720ed4fdaf..0d54bcbc8170 100644
--- a/lib/timerqueue.c
+++ b/lib/timerqueue.c
@@ -33,8 +33,9 @@
  * @head: head of timerqueue
  * @node: timer node to be added
  *
- * Adds the timer node to the timerqueue, sorted by the
- * node's expires value.
+ * Adds the timer node to the timerqueue, sorted by the node's expires
+ * value. Returns true if the newly added timer is the first expiring timer in
+ * the queue.
  */
 bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
 {
@@ -70,7 +71,8 @@ EXPORT_SYMBOL_GPL(timerqueue_add);
  * @head: head of timerqueue
  * @node: timer node to be removed
  *
- * Removes the timer node from the timerqueue.
+ * Removes the timer node from the timerqueue. Returns true if the queue is
+ * not empty after the remove.
  */
 bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
 {

From 3ce120b16cc548472f80cf8644f90eda958cf1b6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 29 Dec 2017 17:34:43 -0800
Subject: [PATCH 171/305] kbuild: add '-fno-stack-check' to kernel build
 options
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It appears that hardened gentoo enables "-fstack-check" by default for
gcc.

That doesn't work _at_all_ for the kernel, because the kernel stack
doesn't act like a user stack at all: it's much smaller, and it doesn't
auto-expand on use.  So the extra "probe one page below the stack" code
generated by -fstack-check just breaks the kernel in horrible ways,
causing infinite double faults etc.

[ I have to say, that the particular code gcc generates looks very
  stupid even for user space where it works, but that's a separate
  issue.  ]

Reported-and-tested-by: Alexander Tsoy <alexander@tsoy.me>
Reported-and-tested-by: Toralf Förster <toralf.foerster@gmx.de>
Cc: stable@kernel.org
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Makefile b/Makefile
index ac8c441866b7..92b74bcd3c2a 100644
--- a/Makefile
+++ b/Makefile
@@ -789,6 +789,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
 # disable invalid "can't wrap" optimizations for signed / pointers
 KBUILD_CFLAGS	+= $(call cc-option,-fno-strict-overflow)
 
+# Make sure -fstack-check isn't enabled (like gentoo apparently did)
+KBUILD_CFLAGS  += $(call cc-option,-fno-stack-check,)
+
 # conserve stack if available
 KBUILD_CFLAGS   += $(call cc-option,-fconserve-stack)
 

From d89e426499cf36b96161bd32970d6783f1fbcb0e Mon Sep 17 00:00:00 2001
From: Simon Ser <contact@emersion.fr>
Date: Sat, 30 Dec 2017 14:43:31 -0600
Subject: [PATCH 172/305] objtool: Fix seg fault caused by missing parameter

Fix a seg fault when no parameter is provided to 'objtool orc'.

Signed-off-by: Simon Ser <contact@emersion.fr>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/9172803ec7ebb72535bcd0b7f966ae96d515968e.1514666459.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/builtin-orc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c
index 4c6b5c9ef073..91e8e19ff5e0 100644
--- a/tools/objtool/builtin-orc.c
+++ b/tools/objtool/builtin-orc.c
@@ -44,6 +44,9 @@ int cmd_orc(int argc, const char **argv)
 	const char *objname;
 
 	argc--; argv++;
+	if (argc <= 0)
+		usage_with_options(orc_usage, check_options);
+
 	if (!strncmp(argv[0], "gen", 3)) {
 		argc = parse_options(argc, argv, check_options, orc_usage, 0);
 		if (argc != 1)
@@ -52,7 +55,6 @@ int cmd_orc(int argc, const char **argv)
 		objname = argv[0];
 
 		return check(objname, no_fp, no_unreachable, true);
-
 	}
 
 	if (!strcmp(argv[0], "dump")) {

From ce90aaf5cde4ce057b297bb6c955caf16ef00ee6 Mon Sep 17 00:00:00 2001
From: Simon Ser <contact@emersion.fr>
Date: Sat, 30 Dec 2017 14:43:32 -0600
Subject: [PATCH 173/305] objtool: Fix seg fault with clang-compiled objects

Fix a seg fault which happens when an input file provided to 'objtool
orc generate' doesn't have a '.shstrtab' section (for instance, object
files produced by clang don't have this section).

Signed-off-by: Simon Ser <contact@emersion.fr>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/c0f2231683e9bed40fac1f13ce2c33b8389854bc.1514666459.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/orc_gen.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index e5ca31429c9b..e61fe703197b 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -165,6 +165,8 @@ int create_orc_sections(struct objtool_file *file)
 
 	/* create .orc_unwind_ip and .rela.orc_unwind_ip sections */
 	sec = elf_create_section(file->elf, ".orc_unwind_ip", sizeof(int), idx);
+	if (!sec)
+		return -1;
 
 	ip_relasec = elf_create_rela_section(file->elf, sec);
 	if (!ip_relasec)

From 322f8b8b340c824aef891342b0f5795d15e11562 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 30 Dec 2017 22:13:53 +0100
Subject: [PATCH 174/305] x86/smpboot: Remove stale TLB flush invocations

smpboot_setup_warm_reset_vector() and smpboot_restore_warm_reset_vector()
invoke local_flush_tlb() for no obvious reason.

Digging in history revealed that the original code in the 2.1 era added
those because the code manipulated a swapper_pg_dir pagetable entry. The
pagetable manipulation was removed long ago in the 2.3 timeframe, but the
TLB flush invocations stayed around forever.

Remove them along with the pointless pr_debug()s which come from the same 2.1
change.

Reported-by: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171230211829.586548655@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 33d6000265aa..c3402fc30865 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -128,25 +128,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 	spin_lock_irqsave(&rtc_lock, flags);
 	CMOS_WRITE(0xa, 0xf);
 	spin_unlock_irqrestore(&rtc_lock, flags);
-	local_flush_tlb();
-	pr_debug("1.\n");
 	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
 							start_eip >> 4;
-	pr_debug("2.\n");
 	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
 							start_eip & 0xf;
-	pr_debug("3.\n");
 }
 
 static inline void smpboot_restore_warm_reset_vector(void)
 {
 	unsigned long flags;
 
-	/*
-	 * Install writable page 0 entry to set BIOS data area.
-	 */
-	local_flush_tlb();
-
 	/*
 	 * Paranoid:  Set warm reset code and vector here back
 	 * to default values.

From decab0888e6e14e11d53cefa85f8b3d3b45ce73c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 30 Dec 2017 22:13:54 +0100
Subject: [PATCH 175/305] x86/mm: Remove preempt_disable/enable() from
 __native_flush_tlb()

The preempt_disable/enable() pair in __native_flush_tlb() was added in
commit:

  5cf0791da5c1 ("x86/mm: Disable preemption during CR3 read+write")

... to protect the UP variant of flush_tlb_mm_range().

That preempt_disable/enable() pair should have been added to the UP variant
of flush_tlb_mm_range() instead.

The UP variant was removed with commit:

  ce4a4e565f52 ("x86/mm: Remove the UP asm/tlbflush.h code, always use the (formerly) SMP code")

... but the preempt_disable/enable() pair stayed around.

The latest change to __native_flush_tlb() in commit:

  6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches")

... added an access to a per CPU variable outside the preempt disabled
regions, which makes no sense at all. __native_flush_tlb() must always
be called with at least preemption disabled.

Remove the preempt_disable/enable() pair and add a WARN_ON_ONCE() to catch
bad callers independent of the smp_processor_id() debugging.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171230211829.679325424@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index b519da4fc03c..f9b48ce152eb 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -345,15 +345,17 @@ static inline void invalidate_user_asid(u16 asid)
  */
 static inline void __native_flush_tlb(void)
 {
-	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 	/*
-	 * If current->mm == NULL then we borrow a mm which may change
-	 * during a task switch and therefore we must not be preempted
-	 * while we write CR3 back:
+	 * Preemption or interrupts must be disabled to protect the access
+	 * to the per CPU variable and to prevent being preempted between
+	 * read_cr3() and write_cr3().
 	 */
-	preempt_disable();
+	WARN_ON_ONCE(preemptible());
+
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+	/* If current->mm == NULL then the read_cr3() "borrows" an mm */
 	native_write_cr3(__native_read_cr3());
-	preempt_enable();
 }
 
 /*

From a62d69857aab4caa43049e72fe0ed5c4a60518dd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Dec 2017 11:24:34 +0100
Subject: [PATCH 176/305] x86/ldt: Plug memory leak in error path

The error path in write_ldt() tries to free 'old_ldt' instead of the newly
allocated 'new_ldt', resulting in a memory leak. It also fails to clean up a
half-populated LDT pagetable, which is not a leak as it gets cleaned up
when the process exits.

Free both the potentially half populated LDT pagetable and the newly
allocated LDT struct. This can be done unconditionally because once an LDT
is mapped subsequent maps will succeed, because the PTE page is already
populated and the two LDTs fit into that single page.

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Fixes: f55f0501cbf6 ("x86/pti: Put the LDT in its own PGD if PTI is on")
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1712311121340.1899@nanos
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/ldt.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 579cc4a66fdf..500e90e44f86 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -421,7 +421,13 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	 */
 	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
 	if (error) {
-		free_ldt_struct(old_ldt);
+		/*
+		 * This can only fail for the first LDT setup. If an LDT is
+		 * already installed then the PTE page is already
+		 * populated. Mop up a half populated page table.
+		 */
+		free_ldt_pgtables(mm);
+		free_ldt_struct(new_ldt);
 		goto out_unlock;
 	}
 

From 7f414195b0c3612acd12b4611a5fe75995cf10c7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Dec 2017 16:52:15 +0100
Subject: [PATCH 177/305] x86/ldt: Make LDT pgtable free conditional

Andy prefers to be paranoid about the pagetable free in the error path of
write_ldt(). Make it conditional and warn whenever the installation of a
secondary LDT fails.

Requested-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/ldt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 500e90e44f86..26d713ecad34 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -426,7 +426,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 		 * already installed then the PTE page is already
 		 * populated. Mop up a half populated page table.
 		 */
-		free_ldt_pgtables(mm);
+		if (!WARN_ON_ONCE(old_ldt))
+			free_ldt_pgtables(mm);
 		free_ldt_struct(new_ldt);
 		goto out_unlock;
 	}

From c0b23903f5b077effec90769d365646a8c2faae0 Mon Sep 17 00:00:00 2001
From: Adam Borowski <kilobyte@angband.pl>
Date: Mon, 25 Dec 2017 16:38:58 +0100
Subject: [PATCH 178/305] MAINTAINERS: mark arch/blackfin/ and its gubbins as
 orphaned

The blackfin architecture has seen no maintainer action of any kind since
April 2015.  No new code, no pull requests, no acks to patches, no response
to mails, nothing.

The web site has an expired certificate (expiration Sep 2017, issued in
2013), the mailing list sees no answers either, with one exception:

  https://sourceforge.net/p/adi-buildroot/mailman/adi-buildroot-devel/
  >
  > Steven is no longer working on this for ADI. Acked by me if this works. Thanks.
  >
  > Best regards,
  > Aaron Wu
  > Analog Devices Inc.

But, Aaron doesn't seem to respond to queries either.

Signed-off-by: Adam Borowski <kilobyte@angband.pl>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index a6e86e20761e..2d0773007c89 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2621,24 +2621,22 @@ F:	fs/bfs/
 F:	include/uapi/linux/bfs_fs.h
 
 BLACKFIN ARCHITECTURE
-M:	Steven Miao <realmz6@gmail.com>
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 T:	git git://git.code.sf.net/p/adi-linux/code
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	arch/blackfin/
 
 BLACKFIN EMAC DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/net/ethernet/adi/
 
 BLACKFIN MEDIA DRIVER
-M:	Scott Jiang <scott.jiang.linux@gmail.com>
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org/
-S:	Supported
+S:	Orphan
 F:	drivers/media/platform/blackfin/
 F:	drivers/media/i2c/adv7183*
 F:	drivers/media/i2c/vs6624*
@@ -2646,25 +2644,25 @@ F:	drivers/media/i2c/vs6624*
 BLACKFIN RTC DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/rtc/rtc-bfin.c
 
 BLACKFIN SDH DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/mmc/host/bfin_sdh.c
 
 BLACKFIN SERIAL DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/tty/serial/bfin_uart.c
 
 BLACKFIN WATCHDOG DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/watchdog/bfin_wdt.c
 
 BLINKM RGB LED DRIVER

From 30a7acd573899fd8b8ac39236eff6468b195ac7d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 31 Dec 2017 14:47:43 -0800
Subject: [PATCH 179/305] Linux 4.15-rc6

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 92b74bcd3c2a..eb1f5973813e 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc5
+EXTRAVERSION = -rc6
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*

From dc32b5c3e6e2ef29cef76d9ce1b92d394446150e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 1 Jan 2018 09:28:31 -0600
Subject: [PATCH 180/305] capabilities: fix buffer overread on very short xattr

If userspace attempted to set a "security.capability" xattr shorter than
4 bytes (e.g. 'setfattr -n security.capability -v x file'), then
cap_convert_nscap() read past the end of the buffer containing the xattr
value because it accessed the ->magic_etc field without verifying that
the xattr value is long enough to contain that field.

Fix it by validating the xattr value size first.

This bug was found using syzkaller with KASAN.  The KASAN report was as
follows (cleaned up slightly):

    BUG: KASAN: slab-out-of-bounds in cap_convert_nscap+0x514/0x630 security/commoncap.c:498
    Read of size 4 at addr ffff88002d8741c0 by task syz-executor1/2852

    CPU: 0 PID: 2852 Comm: syz-executor1 Not tainted 4.15.0-rc6-00200-gcc0aac99d977 #253
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014
    Call Trace:
     __dump_stack lib/dump_stack.c:17 [inline]
     dump_stack+0xe3/0x195 lib/dump_stack.c:53
     print_address_description+0x73/0x260 mm/kasan/report.c:252
     kasan_report_error mm/kasan/report.c:351 [inline]
     kasan_report+0x235/0x350 mm/kasan/report.c:409
     cap_convert_nscap+0x514/0x630 security/commoncap.c:498
     setxattr+0x2bd/0x350 fs/xattr.c:446
     path_setxattr+0x168/0x1b0 fs/xattr.c:472
     SYSC_setxattr fs/xattr.c:487 [inline]
     SyS_setxattr+0x36/0x50 fs/xattr.c:483
     entry_SYSCALL_64_fastpath+0x18/0x85

Fixes: 8db6c34f1dbc ("Introduce v3 namespaced file capabilities")
Cc: <stable@vger.kernel.org> # v4.14+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 security/commoncap.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/security/commoncap.c b/security/commoncap.c
index 4f8e09340956..48620c93d697 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -348,21 +348,18 @@ static __u32 sansflags(__u32 m)
 	return m & ~VFS_CAP_FLAGS_EFFECTIVE;
 }
 
-static bool is_v2header(size_t size, __le32 magic)
+static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
 {
-	__u32 m = le32_to_cpu(magic);
 	if (size != XATTR_CAPS_SZ_2)
 		return false;
-	return sansflags(m) == VFS_CAP_REVISION_2;
+	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
 }
 
-static bool is_v3header(size_t size, __le32 magic)
+static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
 {
-	__u32 m = le32_to_cpu(magic);
-
 	if (size != XATTR_CAPS_SZ_3)
 		return false;
-	return sansflags(m) == VFS_CAP_REVISION_3;
+	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
 }
 
 /*
@@ -405,7 +402,7 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 
 	fs_ns = inode->i_sb->s_user_ns;
 	cap = (struct vfs_cap_data *) tmpbuf;
-	if (is_v2header((size_t) ret, cap->magic_etc)) {
+	if (is_v2header((size_t) ret, cap)) {
 		/* If this is sizeof(vfs_cap_data) then we're ok with the
 		 * on-disk value, so return that.  */
 		if (alloc)
@@ -413,7 +410,7 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 		else
 			kfree(tmpbuf);
 		return ret;
-	} else if (!is_v3header((size_t) ret, cap->magic_etc)) {
+	} else if (!is_v3header((size_t) ret, cap)) {
 		kfree(tmpbuf);
 		return -EINVAL;
 	}
@@ -470,9 +467,9 @@ static kuid_t rootid_from_xattr(const void *value, size_t size,
 	return make_kuid(task_ns, rootid);
 }
 
-static bool validheader(size_t size, __le32 magic)
+static bool validheader(size_t size, const struct vfs_cap_data *cap)
 {
-	return is_v2header(size, magic) || is_v3header(size, magic);
+	return is_v2header(size, cap) || is_v3header(size, cap);
 }
 
 /*
@@ -495,7 +492,7 @@ int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
 
 	if (!*ivalue)
 		return -EINVAL;
-	if (!validheader(size, cap->magic_etc))
+	if (!validheader(size, cap))
 		return -EINVAL;
 	if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
 		return -EPERM;

From 98801506552593c9b8ac11021b0cdad12cab4f6b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 2 Jan 2018 10:02:19 +0000
Subject: [PATCH 181/305] fscache: Fix the default for
 fscache_maybe_release_page()

Fix the default for fscache_maybe_release_page() for when the cookie isn't
valid or the page isn't cached.  It mustn't return false as that indicates
the page cannot yet be freed.

The problem with the default is that if, say, there's no cache, but a
network filesystem's pages are using up almost all the available memory, a
system can OOM because the filesystem ->releasepage() op will not allow
them to be released as fscache_maybe_release_page() incorrectly prevents
it.

This can be tested by writing a sequence of 512MiB files to an AFS mount.
It does not affect NFS or CIFS because both of those wrap the call in a
check of PG_fscache and it shouldn't bother Ceph as that only has
PG_private set whilst writeback is in progress.  This might be an issue for
9P, however.

Note that the pages aren't entirely stuck.  Removing a file or unmounting
will clear things because that uses ->invalidatepage() instead.

Fixes: 201a15428bd5 ("FS-Cache: Handle pages pending storage that get evicted under OOM conditions")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Marc Dionne <marc.dionne@auristor.com>
cc: stable@vger.kernel.org # 2.6.32+
---
 include/linux/fscache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index f4ff47d4a893..fe0c349684fa 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -755,7 +755,7 @@ bool fscache_maybe_release_page(struct fscache_cookie *cookie,
 {
 	if (fscache_cookie_valid(cookie) && PageFsCache(page))
 		return __fscache_maybe_release_page(cookie, page, gfp);
-	return false;
+	return true;
 }
 
 /**

From 7888da95832d50a87bbfdb9f40620ddc66f94b3c Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 2 Jan 2018 10:02:19 +0000
Subject: [PATCH 182/305] afs: Potential uninitialized variable in
 afs_extract_data()

Smatch warns that:

    fs/afs/rxrpc.c:922 afs_extract_data()
    error: uninitialized symbol 'remote_abort'.

Smatch is right that "remote_abort" might be uninitialized when we pass
it to afs_set_call_complete().  I don't know if that function uses the
uninitialized variable.  Anyway, the comment for rxrpc_kernel_recv_data(),
says that "*_abort should also be initialised to 0." and this patch does
that.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/rxrpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index ea1460b9b71a..e1126659f043 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -885,7 +885,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
 {
 	struct afs_net *net = call->net;
 	enum afs_call_state state;
-	u32 remote_abort;
+	u32 remote_abort = 0;
 	int ret;
 
 	_enter("{%s,%zu},,%zu,%d",

From 440fbc3a8a694467ba641234cedb96c28ab2d5fb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 2 Jan 2018 10:02:19 +0000
Subject: [PATCH 183/305] afs: Fix unlink

Repeating creation and deletion of a file on an afs mount will run the box
out of memory, e.g.:

	dd if=/dev/zero of=/afs/scratch/m0 bs=$((1024*1024)) count=512
	rm /afs/scratch/m0

The problem seems to be that it's not properly decrementing the nlink count
so that the inode can be scrapped.

Note that this doesn't fix local creation followed by remote deletion.
That's harder to handle and will require a separate patch as we're not told
that the file has been deleted - only that the directory has changed.

Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c   | 37 +++++++++++++++++++++++++++++--------
 fs/afs/inode.c |  4 ++++
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index ff8d5bf4354f..23c7f395d718 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -895,20 +895,38 @@ error:
  * However, if we didn't have a callback promise outstanding, or it was
  * outstanding on a different server, then it won't break it either...
  */
-static int afs_dir_remove_link(struct dentry *dentry, struct key *key)
+static int afs_dir_remove_link(struct dentry *dentry, struct key *key,
+			       unsigned long d_version_before,
+			       unsigned long d_version_after)
 {
+	bool dir_valid;
 	int ret = 0;
 
+	/* There were no intervening changes on the server if the version
+	 * number we got back was incremented by exactly 1.
+	 */
+	dir_valid = (d_version_after == d_version_before + 1);
+
 	if (d_really_is_positive(dentry)) {
 		struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
 
-		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
-			kdebug("AFS_VNODE_DELETED");
-		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
-
-		ret = afs_validate(vnode, key);
-		if (ret == -ESTALE)
+		if (dir_valid) {
+			drop_nlink(&vnode->vfs_inode);
+			if (vnode->vfs_inode.i_nlink == 0) {
+				set_bit(AFS_VNODE_DELETED, &vnode->flags);
+				clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+			}
 			ret = 0;
+		} else {
+			clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+
+			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+				kdebug("AFS_VNODE_DELETED");
+
+			ret = afs_validate(vnode, key);
+			if (ret == -ESTALE)
+				ret = 0;
+		}
 		_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
 	}
 
@@ -923,6 +941,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 	struct afs_fs_cursor fc;
 	struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
 	struct key *key;
+	unsigned long d_version = (unsigned long)dentry->d_fsdata;
 	int ret;
 
 	_enter("{%x:%u},{%pd}",
@@ -955,7 +974,9 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
 		ret = afs_end_vnode_operation(&fc);
 		if (ret == 0)
-			ret = afs_dir_remove_link(dentry, key);
+			ret = afs_dir_remove_link(
+				dentry, key, d_version,
+				(unsigned long)dvnode->status.data_version);
 	}
 
 error_key:
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 3415eb7484f6..1e81864ef0b2 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -377,6 +377,10 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	}
 
 	read_sequnlock_excl(&vnode->cb_lock);
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		clear_nlink(&vnode->vfs_inode);
+
 	if (valid)
 		goto valid;
 

From afae457d874860a7e299d334f59eede5f3ad4b47 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 2 Jan 2018 10:02:19 +0000
Subject: [PATCH 184/305] afs: Fix missing error handling in afs_write_end()

afs_write_end() is missing page unlock and put if afs_fill_page() fails.

Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/write.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/afs/write.c b/fs/afs/write.c
index cb5f8a3df577..9370e2feb999 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -198,7 +198,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 			ret = afs_fill_page(vnode, key, pos + copied,
 					    len - copied, page);
 			if (ret < 0)
-				return ret;
+				goto out;
 		}
 		SetPageUptodate(page);
 	}
@@ -206,10 +206,12 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 	set_page_dirty(page);
 	if (PageDirty(page))
 		_debug("dirtied");
+	ret = copied;
+
+out:
 	unlock_page(page);
 	put_page(page);
-
-	return copied;
+	return ret;
 }
 
 /*

From ecb101aed86156ec7cd71e5dca668e09146e6994 Mon Sep 17 00:00:00 2001
From: John Sperbeck <jsperbeck@google.com>
Date: Sun, 31 Dec 2017 21:24:58 -0800
Subject: [PATCH 185/305] powerpc/mm: Fix SEGV on mapped region to return
 SEGV_ACCERR

The recent refactoring of the powerpc page fault handler in commit
c3350602e876 ("powerpc/mm: Make bad_area* helper functions") caused
access to protected memory regions to indicate SEGV_MAPERR instead of
the traditional SEGV_ACCERR in the si_code field of a user-space
signal handler. This can confuse debug libraries that temporarily
change the protection of memory regions, and expect to use SEGV_ACCERR
as an indication to restore access to a region.

This commit restores the previous behavior. The following program
exhibits the issue:

    $ ./repro read  || echo "FAILED"
    $ ./repro write || echo "FAILED"
    $ ./repro exec  || echo "FAILED"

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <signal.h>
    #include <sys/mman.h>
    #include <assert.h>

    static void segv_handler(int n, siginfo_t *info, void *arg) {
            _exit(info->si_code == SEGV_ACCERR ? 0 : 1);
    }

    int main(int argc, char **argv)
    {
            void *p = NULL;
            struct sigaction act = {
                    .sa_sigaction = segv_handler,
                    .sa_flags = SA_SIGINFO,
            };

            assert(argc == 2);
            p = mmap(NULL, getpagesize(),
                    (strcmp(argv[1], "write") == 0) ? PROT_READ : 0,
                    MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
            assert(p != MAP_FAILED);

            assert(sigaction(SIGSEGV, &act, NULL) == 0);
            if (strcmp(argv[1], "read") == 0)
                    printf("%c", *(unsigned char *)p);
            else if (strcmp(argv[1], "write") == 0)
                    *(unsigned char *)p = 0;
            else if (strcmp(argv[1], "exec") == 0)
                    ((void (*)(void))p)();
            return 1;  /* failed to generate SEGV */
    }

Fixes: c3350602e876 ("powerpc/mm: Make bad_area* helper functions")
Cc: stable@vger.kernel.org # v4.14+
Signed-off-by: John Sperbeck <jsperbeck@google.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Add commit references in change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/fault.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4797d08581ce..6e1e39035380 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -145,6 +145,11 @@ static noinline int bad_area(struct pt_regs *regs, unsigned long address)
 	return __bad_area(regs, address, SEGV_MAPERR);
 }
 
+static noinline int bad_access(struct pt_regs *regs, unsigned long address)
+{
+	return __bad_area(regs, address, SEGV_ACCERR);
+}
+
 static int do_sigbus(struct pt_regs *regs, unsigned long address,
 		     unsigned int fault)
 {
@@ -490,7 +495,7 @@ retry:
 
 good_area:
 	if (unlikely(access_error(is_write, is_exec, vma)))
-		return bad_area(regs, address);
+		return bad_access(regs, address);
 
 	/*
 	 * If for any reason at all we couldn't handle the fault,

From e0093a89f2386f12cc87047b43e93c3c6e15e94e Mon Sep 17 00:00:00 2001
From: Dhinakaran Pandiyan <dhinakaran.pandiyan@gmail.com>
Date: Tue, 19 Dec 2017 20:35:20 -0800
Subject: [PATCH 186/305] drm/i915/psr: Fix register name mess up.

Commit 77affa31722b ("drm/i915/psr: Fix compiler warnings for
hsw_psr_disable()") swapped status and control registers while fixing
indentation. The _ctl at the end of the status register name must have
led to this.

Fixes: 77affa31722b ("drm/i915/psr: Fix compiler warnings for hsw_psr_disable()")
References: https://www.mrc-cbu.cam.ac.uk/people/matt.davis/cmabridge/
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171220043520.2599-1-dhinakaran.pandiyan@intel.com
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
(cherry picked from commit 14c6547d6df641d3e41fa4f4164f6e267ebfab89)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/intel_psr.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_psr.c b/drivers/gpu/drm/i915/intel_psr.c
index 6e3b430fccdc..55ea5eb3b7df 100644
--- a/drivers/gpu/drm/i915/intel_psr.c
+++ b/drivers/gpu/drm/i915/intel_psr.c
@@ -590,7 +590,7 @@ static void hsw_psr_disable(struct intel_dp *intel_dp,
 	struct drm_i915_private *dev_priv = to_i915(dev);
 
 	if (dev_priv->psr.active) {
-		i915_reg_t psr_ctl;
+		i915_reg_t psr_status;
 		u32 psr_status_mask;
 
 		if (dev_priv->psr.aux_frame_sync)
@@ -599,24 +599,24 @@ static void hsw_psr_disable(struct intel_dp *intel_dp,
 					0);
 
 		if (dev_priv->psr.psr2_support) {
-			psr_ctl = EDP_PSR2_CTL;
+			psr_status = EDP_PSR2_STATUS_CTL;
 			psr_status_mask = EDP_PSR2_STATUS_STATE_MASK;
 
-			I915_WRITE(psr_ctl,
-				   I915_READ(psr_ctl) &
+			I915_WRITE(EDP_PSR2_CTL,
+				   I915_READ(EDP_PSR2_CTL) &
 				   ~(EDP_PSR2_ENABLE | EDP_SU_TRACK_ENABLE));
 
 		} else {
-			psr_ctl = EDP_PSR_STATUS_CTL;
+			psr_status = EDP_PSR_STATUS_CTL;
 			psr_status_mask = EDP_PSR_STATUS_STATE_MASK;
 
-			I915_WRITE(psr_ctl,
-				   I915_READ(psr_ctl) & ~EDP_PSR_ENABLE);
+			I915_WRITE(EDP_PSR_CTL,
+				   I915_READ(EDP_PSR_CTL) & ~EDP_PSR_ENABLE);
 		}
 
 		/* Wait till PSR is idle */
 		if (intel_wait_for_register(dev_priv,
-					    psr_ctl, psr_status_mask, 0,
+					    psr_status, psr_status_mask, 0,
 					    2000))
 			DRM_ERROR("Timed out waiting for PSR Idle State\n");
 

From 3488d0237f6364614f0c59d6d784bb79b11eeb92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
Date: Fri, 8 Dec 2017 23:37:36 +0200
Subject: [PATCH 187/305] drm/i915: Disable DC states around GMBUS on GLK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prevent the DMC from destroying GMBUS transfers on GLK. GMBUS
lives in PG1 so DC off is all we need.

Cc: stable@vger.kernel.org
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171208213739.16388-1-ville.syrjala@linux.intel.com
Reviewed-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
(cherry picked from commit 156961ae7bdf6feb72778e8da83d321b273343fd)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/intel_runtime_pm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c
index 8af286c63d3b..9bf46ab211cb 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -1786,6 +1786,7 @@ void intel_display_power_put(struct drm_i915_private *dev_priv,
 	GLK_DISPLAY_POWERWELL_2_POWER_DOMAINS |		\
 	BIT_ULL(POWER_DOMAIN_MODESET) |			\
 	BIT_ULL(POWER_DOMAIN_AUX_A) |			\
+	BIT_ULL(POWER_DOMAIN_GMBUS) |			\
 	BIT_ULL(POWER_DOMAIN_INIT))
 
 #define CNL_DISPLAY_POWERWELL_2_POWER_DOMAINS (		\

From eda41bdc571e5c51d817c2e8b4578d34a9e383f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
Date: Mon, 13 Nov 2017 15:36:22 +0200
Subject: [PATCH 188/305] drm/i915: Put all non-blocking modesets onto an
 ordered wq
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have plenty of global registers and whatnot programmed without
any further locking by the modeset code. Currently non-blocking
modesets are allowed to execute in parallel which could corrupt
said registers.

To avoid the problem let's run all non-blocking modesets on an
ordered workqueue. We still put page flips etc. to system_unbound_wq
allowing page flips on one pipe to execute in parallel with page flips
or a modeset on another pipe (assuming no known state is shared
between them, at which point they would have been added to the same
atomic commit and serialized that way).

Blocking modesets are already serialized with each other by
connection_mutex, and thus are safe. To serialize them with
non-blocking modesets we just flush the workqueue before executing
blocking modesets.

Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Fixes: 94f050246b42 ("drm/i915: nonblocking commit")
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171113133622.8593-1-ville.syrjala@linux.intel.com
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
(cherry picked from commit 757fffcfdffb6c0dd46c1b264091c36b4e5a86ae)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h      |  3 +++
 drivers/gpu/drm/i915/intel_display.c | 14 +++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 54b5d4c582b6..e143004e66d5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2368,6 +2368,9 @@ struct drm_i915_private {
 	 */
 	struct workqueue_struct *wq;
 
+	/* ordered wq for modesets */
+	struct workqueue_struct *modeset_wq;
+
 	/* Display functions */
 	struct drm_i915_display_funcs display;
 
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 30cf273d57aa..123585eeb87d 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -12544,11 +12544,15 @@ static int intel_atomic_commit(struct drm_device *dev,
 	INIT_WORK(&state->commit_work, intel_atomic_commit_work);
 
 	i915_sw_fence_commit(&intel_state->commit_ready);
-	if (nonblock)
+	if (nonblock && intel_state->modeset) {
+		queue_work(dev_priv->modeset_wq, &state->commit_work);
+	} else if (nonblock) {
 		queue_work(system_unbound_wq, &state->commit_work);
-	else
+	} else {
+		if (intel_state->modeset)
+			flush_workqueue(dev_priv->modeset_wq);
 		intel_atomic_commit_tail(state);
-
+	}
 
 	return 0;
 }
@@ -14462,6 +14466,8 @@ int intel_modeset_init(struct drm_device *dev)
 	enum pipe pipe;
 	struct intel_crtc *crtc;
 
+	dev_priv->modeset_wq = alloc_ordered_workqueue("i915_modeset", 0);
+
 	drm_mode_config_init(dev);
 
 	dev->mode_config.min_width = 0;
@@ -15270,6 +15276,8 @@ void intel_modeset_cleanup(struct drm_device *dev)
 	intel_cleanup_gt_powersave(dev_priv);
 
 	intel_teardown_gmbus(dev_priv);
+
+	destroy_workqueue(dev_priv->modeset_wq);
 }
 
 void intel_connector_attach_encoder(struct intel_connector *connector,

From c1f08c419764439bfa2d3f33d2fdef9d7013fc47 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 3 Dec 2017 15:36:20 -0800
Subject: [PATCH 189/305] documentation/gpu/i915: fix docs build error after
 file rename

Fix documentation build errors after intel_guc_loader.c was
renamed to intel_guc_fw.c.

Error: Cannot open file ../drivers/gpu/drm/i915/intel_guc_loader.c
WARNING: kernel-doc '../scripts/kernel-doc -rst -enable-lineno -function GuC-specific firmware loader ../drivers/gpu/drm/i915/intel_guc_loader.c' failed with return code 1
Error: Cannot open file ../drivers/gpu/drm/i915/intel_guc_loader.c
Error: Cannot open file ../drivers/gpu/drm/i915/intel_guc_loader.c
WARNING: kernel-doc '../scripts/kernel-doc -rst -enable-lineno -internal ../drivers/gpu/drm/i915/intel_guc_loader.c' failed with return code 2

Fixes: e8668bbcb0f9 ("drm/i915/guc: Rename intel_guc_loader.c to intel_guc_fw.c")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/1b214f53-47f5-bef3-f58e-8136de5678ed@infradead.org
(cherry picked from commit 006c23327f8de8575508c458131b304188d426f7)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 Documentation/gpu/i915.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/gpu/i915.rst b/Documentation/gpu/i915.rst
index 2e7ee0313c1c..e21698e16534 100644
--- a/Documentation/gpu/i915.rst
+++ b/Documentation/gpu/i915.rst
@@ -341,10 +341,10 @@ GuC
 GuC-specific firmware loader
 ----------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_guc_loader.c
+.. kernel-doc:: drivers/gpu/drm/i915/intel_guc_fw.c
    :doc: GuC-specific firmware loader
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_guc_loader.c
+.. kernel-doc:: drivers/gpu/drm/i915/intel_guc_fw.c
    :internal:
 
 GuC-based command submission

From df29c9db8ace4497a61f3b3d33c2b8a7fd4b7b8e Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Mon, 4 Dec 2017 14:32:46 +0100
Subject: [PATCH 190/305] omapdrm/dss/hdmi4_cec: fix interrupt handling

The omap4 CEC hardware cannot tell a Nack from a Low Drive from an
Arbitration Lost error, so just report a Nack, which is almost
certainly the reason for the error anyway.

This also simplifies the implementation. The only three interrupts
that need to be enabled are:

Transmit Buffer Full/Empty Change event: triggered when the
transmit finished successfully and cleared the buffer.

Receiver FIFO Not Empty event: triggered when a message was received.

Frame Retransmit Count Exceeded event: triggered when a transmit
failed repeatedly, usually due to the message being Nacked. Other
reasons are possible (Low Drive, Arbitration Lost) but there is no
way to know. If this happens the TX buffer needs to be cleared
manually.

While testing various error conditions I noticed that the hardware
can receive messages up to 18 bytes in total, which exceeds the legal
maximum of 16. This could cause a buffer overflow, so we check for
this and constrain the size to 16 bytes.

The old incorrect interrupt handler could cause the CEC framework to
enter into a bad state because it mis-detected the "Start Bit Irregularity
event" as an ARB_LOST transmit error when it actually is a receive error
which should be ignored.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Reported-by: Henrik Austad <haustad@cisco.com>
Tested-by: Henrik Austad <haustad@cisco.com>
Tested-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
---
 drivers/gpu/drm/omapdrm/dss/hdmi4_cec.c | 46 +++++--------------------
 1 file changed, 9 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/omapdrm/dss/hdmi4_cec.c b/drivers/gpu/drm/omapdrm/dss/hdmi4_cec.c
index e626eddf24d5..23db74ae1826 100644
--- a/drivers/gpu/drm/omapdrm/dss/hdmi4_cec.c
+++ b/drivers/gpu/drm/omapdrm/dss/hdmi4_cec.c
@@ -78,6 +78,8 @@ static void hdmi_cec_received_msg(struct hdmi_core_data *core)
 
 			/* then read the message */
 			msg.len = cnt & 0xf;
+			if (msg.len > CEC_MAX_MSG_SIZE - 2)
+				msg.len = CEC_MAX_MSG_SIZE - 2;
 			msg.msg[0] = hdmi_read_reg(core->base,
 						   HDMI_CEC_RX_CMD_HEADER);
 			msg.msg[1] = hdmi_read_reg(core->base,
@@ -104,26 +106,6 @@ static void hdmi_cec_received_msg(struct hdmi_core_data *core)
 	}
 }
 
-static void hdmi_cec_transmit_fifo_empty(struct hdmi_core_data *core, u32 stat1)
-{
-	if (stat1 & 2) {
-		u32 dbg3 = hdmi_read_reg(core->base, HDMI_CEC_DBG_3);
-
-		cec_transmit_done(core->adap,
-				  CEC_TX_STATUS_NACK |
-				  CEC_TX_STATUS_MAX_RETRIES,
-				  0, (dbg3 >> 4) & 7, 0, 0);
-	} else if (stat1 & 1) {
-		cec_transmit_done(core->adap,
-				  CEC_TX_STATUS_ARB_LOST |
-				  CEC_TX_STATUS_MAX_RETRIES,
-				  0, 0, 0, 0);
-	} else if (stat1 == 0) {
-		cec_transmit_done(core->adap, CEC_TX_STATUS_OK,
-				  0, 0, 0, 0);
-	}
-}
-
 void hdmi4_cec_irq(struct hdmi_core_data *core)
 {
 	u32 stat0 = hdmi_read_reg(core->base, HDMI_CEC_INT_STATUS_0);
@@ -132,27 +114,21 @@ void hdmi4_cec_irq(struct hdmi_core_data *core)
 	hdmi_write_reg(core->base, HDMI_CEC_INT_STATUS_0, stat0);
 	hdmi_write_reg(core->base, HDMI_CEC_INT_STATUS_1, stat1);
 
-	if (stat0 & 0x40)
+	if (stat0 & 0x20) {
+		cec_transmit_done(core->adap, CEC_TX_STATUS_OK,
+				  0, 0, 0, 0);
 		REG_FLD_MOD(core->base, HDMI_CEC_DBG_3, 0x1, 7, 7);
-	else if (stat0 & 0x24)
-		hdmi_cec_transmit_fifo_empty(core, stat1);
-	if (stat1 & 2) {
+	} else if (stat1 & 0x02) {
 		u32 dbg3 = hdmi_read_reg(core->base, HDMI_CEC_DBG_3);
 
 		cec_transmit_done(core->adap,
 				  CEC_TX_STATUS_NACK |
 				  CEC_TX_STATUS_MAX_RETRIES,
 				  0, (dbg3 >> 4) & 7, 0, 0);
-	} else if (stat1 & 1) {
-		cec_transmit_done(core->adap,
-				  CEC_TX_STATUS_ARB_LOST |
-				  CEC_TX_STATUS_MAX_RETRIES,
-				  0, 0, 0, 0);
+		REG_FLD_MOD(core->base, HDMI_CEC_DBG_3, 0x1, 7, 7);
 	}
 	if (stat0 & 0x02)
 		hdmi_cec_received_msg(core);
-	if (stat1 & 0x3)
-		REG_FLD_MOD(core->base, HDMI_CEC_DBG_3, 0x1, 7, 7);
 }
 
 static bool hdmi_cec_clear_tx_fifo(struct cec_adapter *adap)
@@ -231,18 +207,14 @@ static int hdmi_cec_adap_enable(struct cec_adapter *adap, bool enable)
 	/*
 	 * Enable CEC interrupts:
 	 * Transmit Buffer Full/Empty Change event
-	 * Transmitter FIFO Empty event
 	 * Receiver FIFO Not Empty event
 	 */
-	hdmi_write_reg(core->base, HDMI_CEC_INT_ENABLE_0, 0x26);
+	hdmi_write_reg(core->base, HDMI_CEC_INT_ENABLE_0, 0x22);
 	/*
 	 * Enable CEC interrupts:
-	 * RX FIFO Overrun Error event
-	 * Short Pulse Detected event
 	 * Frame Retransmit Count Exceeded event
-	 * Start Bit Irregularity event
 	 */
-	hdmi_write_reg(core->base, HDMI_CEC_INT_ENABLE_1, 0x0f);
+	hdmi_write_reg(core->base, HDMI_CEC_INT_ENABLE_1, 0x02);
 
 	/* cec calibration enable (self clearing) */
 	hdmi_write_reg(core->base, HDMI_CEC_SETUP, 0x03);

From 8a9bd4f8ebc6800bfc0596e28631ff6809a2f615 Mon Sep 17 00:00:00 2001
From: Stefan Haberland <sth@linux.vnet.ibm.com>
Date: Wed, 6 Dec 2017 10:30:39 +0100
Subject: [PATCH 191/305] s390/dasd: fix wrongly assigned configuration data

We store per path and per device configuration data to identify the
path or device correctly. The per path configuration data might get
mixed up if the original request gets into error recovery and is
started with a random path mask.

This would lead to a wrong identification of a path in case of a CUIR
event for example.

Fix by copying the path mask from the original request to the error
recovery request in case it is a path verification request.

Signed-off-by: Stefan Haberland <sth@linux.vnet.ibm.com>
Reviewed-by: Jan Hoeppner <hoeppner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/block/dasd_3990_erp.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c
index c94b606e0df8..ee14d8e45c97 100644
--- a/drivers/s390/block/dasd_3990_erp.c
+++ b/drivers/s390/block/dasd_3990_erp.c
@@ -2803,6 +2803,16 @@ dasd_3990_erp_action(struct dasd_ccw_req * cqr)
 		erp = dasd_3990_erp_handle_match_erp(cqr, erp);
 	}
 
+
+	/*
+	 * For path verification work we need to stick with the path that was
+	 * originally chosen so that the per path configuration data is
+	 * assigned correctly.
+	 */
+	if (test_bit(DASD_CQR_VERIFY_PATH, &erp->flags) && cqr->lpm) {
+		erp->lpm = cqr->lpm;
+	}
+
 	if (device->features & DASD_FEATURE_ERPLOG) {
 		/* print current erp_chain */
 		dev_err(&device->cdev->dev,

From 4aac2caff30fdef1db8403af81e79807811d22ea Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 28 Dec 2017 03:46:48 +0000
Subject: [PATCH 192/305] xen/pvcalls: use GFP_ATOMIC under spin lock

A spin lock is taken here so we should use GFP_ATOMIC.

Fixes: 9774c6cca266 ("xen/pvcalls: implement accept command")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/pvcalls-front.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index d1e1d8d2b9d5..4c789e61554b 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -805,7 +805,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
 		pvcalls_exit();
 		return ret;
 	}
-	map2 = kzalloc(sizeof(*map2), GFP_KERNEL);
+	map2 = kzalloc(sizeof(*map2), GFP_ATOMIC);
 	if (map2 == NULL) {
 		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
 			  (void *)&map->passive.flags);

From af2e01da344e9f90e38d039c39385882d7364c0f Mon Sep 17 00:00:00 2001
From: Markus Heiser <markus.heiser@darmarit.de>
Date: Tue, 12 Dec 2017 12:38:37 +0100
Subject: [PATCH 193/305] docs: fix, intel_guc_loader.c has been moved to
 intel_guc_fw.c

With commit d9e2e0143c the 'GuC-specific firmware loader' doc
section was removed from intel_guc_loader.c without a
replacement.  So let's remove it from the Kernel-doc::

  .. kernel-doc:: drivers/gpu/drm/i915/intel_guc_loader.c
     :doc: GuC-specific firmware loader

With commit e8668bbcb0 intel_guc_loader.c was renamed to
intel_guc_fw.c and to name just one, intel_guc_init_hw() was
renamed to intel_guc_fw_upload(). Since we get errors in the
Sphinx build like:

- Error: Cannot open file ./drivers/gpu/drm/i915/intel_guc_loader.c

Change the kernel-doc directive from intel_guc_loader.c to
intel_guc_fw.c

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
[danvet: Rebase onto the partial fix 006c23327f8d
("documentation/gpu/i915: fix docs build error after file rename")]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/1513078717-12373-1-git-send-email-markus.heiser@darmarit.de
(cherry picked from commit 0132a1a5d44d2cd32a249dbe999a88c2134a6bd1)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 Documentation/gpu/i915.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Documentation/gpu/i915.rst b/Documentation/gpu/i915.rst
index e21698e16534..e94d3ac2bdd0 100644
--- a/Documentation/gpu/i915.rst
+++ b/Documentation/gpu/i915.rst
@@ -341,9 +341,6 @@ GuC
 GuC-specific firmware loader
 ----------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_guc_fw.c
-   :doc: GuC-specific firmware loader
-
 .. kernel-doc:: drivers/gpu/drm/i915/intel_guc_fw.c
    :internal:
 

From 57d72e159b60456c8bb281736c02ddd3164037aa Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Date: Thu, 14 Dec 2017 11:03:01 +0000
Subject: [PATCH 194/305] iommu/arm-smmu-v3: Don't free page table ops twice

Kasan reports a double free when finalise_stage_fn fails: the io_pgtable
ops are freed by arm_smmu_domain_finalise and then again by
arm_smmu_domain_free. Prevent this by leaving pgtbl_ops empty on failure.

Cc: <stable@vger.kernel.org>
Fixes: 48ec83bcbcf5 ("iommu/arm-smmu: Add initial driver support for ARM SMMUv3 devices")
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/arm-smmu-v3.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f122071688fd..db4281d0e269 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1698,13 +1698,15 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
 	domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
 	domain->geometry.aperture_end = (1UL << ias) - 1;
 	domain->geometry.force_aperture = true;
-	smmu_domain->pgtbl_ops = pgtbl_ops;
 
 	ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg);
-	if (ret < 0)
+	if (ret < 0) {
 		free_io_pgtable_ops(pgtbl_ops);
+		return ret;
+	}
 
-	return ret;
+	smmu_domain->pgtbl_ops = pgtbl_ops;
+	return 0;
 }
 
 static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)

From 563b5cbe334e9503ab2b234e279d500fc4f76018 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 2 Jan 2018 12:33:14 +0000
Subject: [PATCH 195/305] iommu/arm-smmu-v3: Cope with duplicated Stream IDs

For PCI devices behind an aliasing PCIe-to-PCI/X bridge, the bridge
alias to DevFn 0.0 on the subordinate bus may match the original RID of
the device, resulting in the same SID being present in the device's
fwspec twice. This causes trouble later in arm_smmu_write_strtab_ent()
when we wind up visiting the STE a second time and find it already live.

Avoid the issue by giving arm_smmu_install_ste_for_dev() the cleverness
to skip over duplicates. It seems mildly counterintuitive compared to
preventing the duplicates from existing in the first place, but since
the DT and ACPI probe paths build their fwspecs differently, this is
actually the cleanest and most self-contained way to deal with it.

Cc: <stable@vger.kernel.org>
Fixes: 8f78515425da ("iommu/arm-smmu: Implement of_xlate() for SMMUv3")
Reported-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Tomasz Nowicki <Tomasz.Nowicki@cavium.com>
Tested-by: Jayachandran C. <jnair@caviumnetworks.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/arm-smmu-v3.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index db4281d0e269..744592d330ca 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1733,7 +1733,7 @@ static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)
 
 static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec)
 {
-	int i;
+	int i, j;
 	struct arm_smmu_master_data *master = fwspec->iommu_priv;
 	struct arm_smmu_device *smmu = master->smmu;
 
@@ -1741,6 +1741,13 @@ static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec)
 		u32 sid = fwspec->ids[i];
 		__le64 *step = arm_smmu_get_step_for_sid(smmu, sid);
 
+		/* Bridged PCI devices may end up with duplicated IDs */
+		for (j = 0; j < i; j++)
+			if (fwspec->ids[j] == sid)
+				break;
+		if (j < i)
+			continue;
+
 		arm_smmu_write_strtab_ent(smmu, sid, step, &master->ste);
 	}
 }

From 55a5ec9b77106ffc05e8c40d7568432bf4696d7b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 2 Jan 2018 11:45:07 -0500
Subject: [PATCH 196/305] Revert "net: core: dev_get_valid_name is now the same
 as dev_alloc_name_ns"

This reverts commit 87c320e51519a83c496ab7bfb4e96c8f9c001e89.

Changing the error return code in some situations turns out to
be harmful in practice.  In particular Michael Ellerman reports
that DHCP fails on his powerpc machines, and this revert gets
things working again.

Johannes Berg agrees that this revert is the best course of
action for now.

Fixes: 029b6d140550 ("Revert "net: core: maybe return -EEXIST in __dev_alloc_name"")
Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 01ee854454a8..0e0ba36eeac9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1146,7 +1146,19 @@ EXPORT_SYMBOL(dev_alloc_name);
 int dev_get_valid_name(struct net *net, struct net_device *dev,
 		       const char *name)
 {
-	return dev_alloc_name_ns(net, dev, name);
+	BUG_ON(!net);
+
+	if (!dev_valid_name(name))
+		return -EINVAL;
+
+	if (strchr(name, '%'))
+		return dev_alloc_name_ns(net, dev, name);
+	else if (__dev_get_by_name(net, name))
+		return -EEXIST;
+	else if (dev->name != name)
+		strlcpy(dev->name, name, IFNAMSIZ);
+
+	return 0;
 }
 EXPORT_SYMBOL(dev_get_valid_name);
 

From beed9263f4000c48a5c48912f26576f6fa091181 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 13 Dec 2017 13:50:07 +0200
Subject: [PATCH 197/305] btrfs: Fix flush bio leak

Commit e0ae99941423 ("btrfs: preallocate device flush bio") reworked
the way the flush bio is allocated and used. Concretely it allocates
the bio in __alloc_device and then re-uses it multiple times with a
very simple endio routine that just calls complete() without consuming
a reference. Allocated bios by default come with a ref count of 1,
which is then consumed by the endio routine (or not, in which case they
should be bio_put by the caller). The way the impleementation works now
is that the flush bio has a refcount of 2 and we only ever bio_put it
once, leaving it to hang indefinitely. Fix this by removing the extra
bio_get in __alloc_device.

Fixes: e0ae99941423 ("btrfs: preallocate device flush bio")
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d48b24e54366..94d28f549837 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -237,7 +237,6 @@ static struct btrfs_device *__alloc_device(void)
 		kfree(dev);
 		return ERR_PTR(-ENOMEM);
 	}
-	bio_get(dev->flush_bio);
 
 	INIT_LIST_HEAD(&dev->dev_list);
 	INIT_LIST_HEAD(&dev->dev_alloc_list);

From ec35e48b286959991cdbb886f1bdeda4575c80b4 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Fri, 15 Dec 2017 11:58:27 -0800
Subject: [PATCH 198/305] btrfs: fix refcount_t usage when deleting
 btrfs_delayed_nodes

refcounts have a generic implementation and an asm optimized one.  The
generic version has extra debugging to make sure that once a refcount
goes to zero, refcount_inc won't increase it.

The btrfs delayed inode code wasn't expecting this, and we're tripping
over the warnings when the generic refcounts are used.  We ended up with
this race:

Process A                                         Process B
                                                  btrfs_get_delayed_node()
						  spin_lock(root->inode_lock)
						  radix_tree_lookup()
__btrfs_release_delayed_node()
refcount_dec_and_test(&delayed_node->refs)
our refcount is now zero
						  refcount_add(2) <---
						  warning here, refcount
                                                  unchanged

spin_lock(root->inode_lock)
radix_tree_delete()

With the generic refcounts, we actually warn again when process B above
tries to release his refcount because refcount_add() turned into a
no-op.

We saw this in production on older kernels without the asm optimized
refcounts.

The fix used here is to use refcount_inc_not_zero() to detect when the
object is in the middle of being freed and return NULL.  This is almost
always the right answer anyway, since we usually end up pitching the
delayed_node if it didn't have fresh data in it.

This also changes __btrfs_release_delayed_node() to remove the extra
check for zero refcounts before radix tree deletion.
btrfs_get_delayed_node() was the only path that was allowing refcounts
to go from zero to one.

Fixes: 6de5f18e7b0da ("btrfs: fix refcount_t usage when deleting btrfs_delayed_node")
CC: <stable@vger.kernel.org> # 4.12+
Signed-off-by: Chris Mason <clm@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-inode.c | 45 ++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5d73f79ded8b..056276101c63 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -87,6 +87,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 
 	spin_lock(&root->inode_lock);
 	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+
 	if (node) {
 		if (btrfs_inode->delayed_node) {
 			refcount_inc(&node->refs);	/* can be accessed */
@@ -94,9 +95,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 			spin_unlock(&root->inode_lock);
 			return node;
 		}
-		btrfs_inode->delayed_node = node;
-		/* can be accessed and cached in the inode */
-		refcount_add(2, &node->refs);
+
+		/*
+		 * It's possible that we're racing into the middle of removing
+		 * this node from the radix tree.  In this case, the refcount
+		 * was zero and it should never go back to one.  Just return
+		 * NULL like it was never in the radix at all; our release
+		 * function is in the process of removing it.
+		 *
+		 * Some implementations of refcount_inc refuse to bump the
+		 * refcount once it has hit zero.  If we don't do this dance
+		 * here, refcount_inc() may decide to just WARN_ONCE() instead
+		 * of actually bumping the refcount.
+		 *
+		 * If this node is properly in the radix, we want to bump the
+		 * refcount twice, once for the inode and once for this get
+		 * operation.
+		 */
+		if (refcount_inc_not_zero(&node->refs)) {
+			refcount_inc(&node->refs);
+			btrfs_inode->delayed_node = node;
+		} else {
+			node = NULL;
+		}
+
 		spin_unlock(&root->inode_lock);
 		return node;
 	}
@@ -254,17 +276,18 @@ static void __btrfs_release_delayed_node(
 	mutex_unlock(&delayed_node->mutex);
 
 	if (refcount_dec_and_test(&delayed_node->refs)) {
-		bool free = false;
 		struct btrfs_root *root = delayed_node->root;
+
 		spin_lock(&root->inode_lock);
-		if (refcount_read(&delayed_node->refs) == 0) {
-			radix_tree_delete(&root->delayed_nodes_tree,
-					  delayed_node->inode_id);
-			free = true;
-		}
+		/*
+		 * Once our refcount goes to zero, nobody is allowed to bump it
+		 * back up.  We can delete it now.
+		 */
+		ASSERT(refcount_read(&delayed_node->refs) == 0);
+		radix_tree_delete(&root->delayed_nodes_tree,
+				  delayed_node->inode_id);
 		spin_unlock(&root->inode_lock);
-		if (free)
-			kmem_cache_free(delayed_node_cache, delayed_node);
+		kmem_cache_free(delayed_node_cache, delayed_node);
 	}
 }
 

From 23263ec86a5f44312d2899323872468752324107 Mon Sep 17 00:00:00 2001
From: Eli Cooper <elicooper@gmx.com>
Date: Mon, 25 Dec 2017 10:43:49 +0800
Subject: [PATCH 199/305] ip6_tunnel: disable dst caching if tunnel is
 dual-stack

When an ip6_tunnel is in mode 'any', where the transport layer
protocol can be either 4 or 41, dst_cache must be disabled.

This is because xfrm policies might apply to only one of the two
protocols. Caching dst would cause xfrm policies for one protocol
incorrectly used for the other.

Signed-off-by: Eli Cooper <elicooper@gmx.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 931c38f6ff4a..b263c809d8d4 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1074,10 +1074,11 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 			memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
 			neigh_release(neigh);
 		}
-	} else if (!(t->parms.flags &
-		     (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) {
-		/* enable the cache only only if the routing decision does
-		 * not depend on the current inner header value
+	} else if (t->parms.proto != 0 && !(t->parms.flags &
+					    (IP6_TNL_F_USE_ORIG_TCLASS |
+					     IP6_TNL_F_USE_ORIG_FWMARK))) {
+		/* enable the cache only if neither the outer protocol nor the
+		 * routing decision depends on the current inner header value
 		 */
 		use_cache = true;
 	}

From 52a589d51f1008f62569bf89e95b26221ee76690 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 25 Dec 2017 14:43:58 +0800
Subject: [PATCH 200/305] geneve: update skb dst pmtu on tx path

Commit a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path") has fixed
a performance issue caused by the change of lower dev's mtu for vxlan.

The same thing needs to be done for geneve as well.

Note that geneve cannot adjust its mtu according to lower dev's mtu
when creating it. The performance is very low later when netperfing
over it without fixing the mtu manually. This patch could also avoid
this issue.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index b718a02a6bb6..0a48b3073d3d 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -825,6 +825,13 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
 
+	if (skb_dst(skb)) {
+		int mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr) -
+			  GENEVE_BASE_HLEN - info->options_len - 14;
+
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+	}
+
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	if (geneve->collect_md) {
 		tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
@@ -864,6 +871,13 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (IS_ERR(dst))
 		return PTR_ERR(dst);
 
+	if (skb_dst(skb)) {
+		int mtu = dst_mtu(dst) - sizeof(struct ipv6hdr) -
+			  GENEVE_BASE_HLEN - info->options_len - 14;
+
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+	}
+
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	if (geneve->collect_md) {
 		prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);

From 2fa771be953a17f8e0a9c39103464c2574444c62 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 25 Dec 2017 14:45:12 +0800
Subject: [PATCH 201/305] ip6_tunnel: allow ip6gre dev mtu to be set below 1280

Commit 582442d6d5bc ("ipv6: Allow the MTU of ipip6 tunnel to be set
below 1280") fixed a mtu setting issue. It works for ipip6 tunnel.

But ip6gre dev updates the mtu also with ip6_tnl_change_mtu. Since
the inner packet over ip6gre can be ipv4 and its mtu should also
be allowed to set below 1280, the same issue also exists on ip6gre.

This patch is to fix it by simply changing to check if parms.proto
is IPPROTO_IPV6 in ip6_tnl_change_mtu instead, to make ip6gre to
go to 'else' branch.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index b263c809d8d4..9a7cf355bc8c 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1677,11 +1677,11 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
 {
 	struct ip6_tnl *tnl = netdev_priv(dev);
 
-	if (tnl->parms.proto == IPPROTO_IPIP) {
-		if (new_mtu < ETH_MIN_MTU)
+	if (tnl->parms.proto == IPPROTO_IPV6) {
+		if (new_mtu < IPV6_MIN_MTU)
 			return -EINVAL;
 	} else {
-		if (new_mtu < IPV6_MIN_MTU)
+		if (new_mtu < ETH_MIN_MTU)
 			return -EINVAL;
 	}
 	if (new_mtu > 0xFFF8 - dev->hard_header_len)

From 8764a8267b128405cf383157d5e9a4a3735d2409 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Mon, 25 Dec 2017 08:57:35 +0100
Subject: [PATCH 202/305] mlxsw: spectrum_router: Fix NULL pointer deref

When we remove the neighbour associated with a nexthop we should always
refuse to write the nexthop to the adjacency table. Regardless if it is
already present in the table or not.

Otherwise, we risk dereferencing the NULL pointer that was set instead
of the neighbour.

Fixes: a7ff87acd995 ("mlxsw: spectrum_router: Implement next-hop routing")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: Alexander Petrovskiy <alexpe@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index be657b8533f0..434b3922b34f 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -3228,7 +3228,7 @@ static void __mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp_nexthop *nh,
 {
 	if (!removing)
 		nh->should_offload = 1;
-	else if (nh->offloaded)
+	else
 		nh->should_offload = 0;
 	nh->update = 1;
 }

From 90045fc9c78855bdc625a0ab185d97b72a937613 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Mon, 25 Dec 2017 09:05:33 +0100
Subject: [PATCH 203/305] mlxsw: spectrum: Relax sanity checks during
 enslavement

Since commit 25cc72a33835 ("mlxsw: spectrum: Forbid linking to devices that
have uppers") the driver forbids enslavement to netdevs that already
have uppers of their own, as this can result in various ordering
problems.

This requirement proved to be too strict for some users who need to be
able to enslave ports to a bridge that already has uppers. In this case,
we can allow the enslavement if the bridge is already known to us, as
any configuration performed on top of the bridge was already reflected
to the device.

Fixes: 25cc72a33835 ("mlxsw: spectrum: Forbid linking to devices that have uppers")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: Alexander Petrovskiy <alexpe@mellanox.com>
Tested-by: Alexander Petrovskiy <alexpe@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c        | 11 +++++++++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h        |  2 ++
 .../net/ethernet/mellanox/mlxsw/spectrum_switchdev.c  |  6 ++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 9bd8d28de152..c3837ca7a705 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4376,7 +4376,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev,
 		}
 		if (!info->linking)
 			break;
-		if (netdev_has_any_upper_dev(upper_dev)) {
+		if (netdev_has_any_upper_dev(upper_dev) &&
+		    (!netif_is_bridge_master(upper_dev) ||
+		     !mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp,
+							  upper_dev))) {
 			NL_SET_ERR_MSG(extack,
 				       "spectrum: Enslaving a port to a device that already has an upper device is not supported");
 			return -EINVAL;
@@ -4504,6 +4507,7 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
 					      u16 vid)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	struct netdev_notifier_changeupper_info *info = ptr;
 	struct netlink_ext_ack *extack;
 	struct net_device *upper_dev;
@@ -4520,7 +4524,10 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
 		}
 		if (!info->linking)
 			break;
-		if (netdev_has_any_upper_dev(upper_dev)) {
+		if (netdev_has_any_upper_dev(upper_dev) &&
+		    (!netif_is_bridge_master(upper_dev) ||
+		     !mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp,
+							  upper_dev))) {
 			NL_SET_ERR_MSG(extack, "spectrum: Enslaving a port to a device that already has an upper device is not supported");
 			return -EINVAL;
 		}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 432ab9b12b7f..05ce1befd9b3 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -365,6 +365,8 @@ int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
 void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port,
 				struct net_device *brport_dev,
 				struct net_device *br_dev);
+bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp,
+					 const struct net_device *br_dev);
 
 /* spectrum.c */
 int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 7b8548e25ae7..593ad31be749 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -152,6 +152,12 @@ mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge,
 	return NULL;
 }
 
+bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp,
+					 const struct net_device *br_dev)
+{
+	return !!mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev);
+}
+
 static struct mlxsw_sp_bridge_device *
 mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge,
 			      struct net_device *br_dev)

From 02a0d9216d4daf6a58d88642bd2da2c78c327552 Mon Sep 17 00:00:00 2001
From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
Date: Tue, 2 Jan 2018 09:39:25 -0800
Subject: [PATCH 204/305] Input: xen-kbdfront - do not advertise multi-touch
 pressure support

Some user-space applications expect multi-touch pressure
on contact to be reported if it is advertised in device
properties. Otherwise, such applications may treat reports
not as actual touches, but hovering. Currently this is
only advertised, but not reported.
Fix this by not advertising that ABS_MT_PRESSURE is supported.

Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
Signed-off-by: Andrii Chepurnyi <andrii_chepurnyi@epam.com>
Patchwork-Id: 10140017
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/misc/xen-kbdfront.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/input/misc/xen-kbdfront.c b/drivers/input/misc/xen-kbdfront.c
index 6bf56bb5f8d9..d91f3b1c5375 100644
--- a/drivers/input/misc/xen-kbdfront.c
+++ b/drivers/input/misc/xen-kbdfront.c
@@ -326,8 +326,6 @@ static int xenkbd_probe(struct xenbus_device *dev,
 				     0, width, 0, 0);
 		input_set_abs_params(mtouch, ABS_MT_POSITION_Y,
 				     0, height, 0, 0);
-		input_set_abs_params(mtouch, ABS_MT_PRESSURE,
-				     0, 255, 0, 0);
 
 		ret = input_mt_init_slots(mtouch, num_cont, INPUT_MT_DIRECT);
 		if (ret) {

From 5a371cf87e145b86efd32007e46146e78c1eff6d Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Sun, 31 Dec 2017 15:33:14 +0200
Subject: [PATCH 205/305] IB/mlx4: Fix mlx4_ib_alloc_mr error flow

ibmr.device is set only after ib_alloc_mr() has completed successfully.
Therefore, if mlx4_mr_enable() returns an error, the error flow
unwinder calls mlx4_free_priv_pages(), which uses ibmr.device.

Such usage causes a NULL dereference oops. To fix it, the IB device
should be set in the mr struct at an earlier stage (e.g. prior to
calling mlx4_free_priv_pages()).

Fixes: 1b2cd0fc673c ("IB/mlx4: Support the new memory registration API")
Signed-off-by: Nitzan Carmi <nitzanc@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx4/mr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 313bfb9ccb71..4975f3e6596e 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -642,7 +642,6 @@ struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
 		goto err_free_mr;
 
 	mr->max_pages = max_num_sg;
-
 	err = mlx4_mr_enable(dev->dev, &mr->mmr);
 	if (err)
 		goto err_free_pl;
@@ -653,6 +652,7 @@ struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
 	return &mr->ibmr;
 
 err_free_pl:
+	mr->ibmr.device = pd->device;
 	mlx4_free_priv_pages(mr);
 err_free_mr:
 	(void) mlx4_mr_free(dev->dev, &mr->mmr);

From 16ba3defb8bd01a9464ba4820a487f5b196b455b Mon Sep 17 00:00:00 2001
From: Erez Shitrit <erezsh@mellanox.com>
Date: Sun, 31 Dec 2017 15:33:15 +0200
Subject: [PATCH 206/305] IB/ipoib: Fix race condition in neigh creation

When using enhanced mode for IPoIB, two threads may execute xmit in
parallel to two different TX queues while the target is the same.
In this case, both of them will add the same neighbor to the path's
neigh link list and we might see the following message:

  list_add double add: new=ffff88024767a348, prev=ffff88024767a348...
  WARNING: lib/list_debug.c:31__list_add_valid+0x4e/0x70
  ipoib_start_xmit+0x477/0x680 [ib_ipoib]
  dev_hard_start_xmit+0xb9/0x3e0
  sch_direct_xmit+0xf9/0x250
  __qdisc_run+0x176/0x5d0
  __dev_queue_xmit+0x1f5/0xb10
  __dev_queue_xmit+0x55/0xb10

Analysis:
Two SKB are scheduled to be transmitted from two cores.
In ipoib_start_xmit, both get NULL when calling ipoib_neigh_get.
Two calls to neigh_add_path are made. One thread takes the spin-lock
and calls ipoib_neigh_alloc which creates the neigh structure,
then (after the __path_find) the neigh is added to the path's neigh
link list. When the second thread enters the critical section it also
calls ipoib_neigh_alloc but in this case it gets the already allocated
ipoib_neigh structure, which is already linked to the path's neigh
link list and adds it again to the list. Besides triggering the list
debug warning above, this creates a loop in the linked list. This loop
leads to an endless loop inside path_rec_completion.

Solution:
Check list_empty(&neigh->list) before adding to the list.
Add a similar fix in "ipoib_multicast.c::ipoib_mcast_send"

Fixes: b63b70d87741 ('IPoIB: Use a private hash table for path lookup in xmit path')
Signed-off-by: Erez Shitrit <erezsh@mellanox.com>
Reviewed-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c     | 25 +++++++++++++------
 .../infiniband/ulp/ipoib/ipoib_multicast.c    |  5 +++-
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 12b7f911f0e5..8880351df179 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -902,8 +902,8 @@ static int path_rec_start(struct net_device *dev,
 	return 0;
 }
 
-static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
-			   struct net_device *dev)
+static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr,
+					  struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
 	struct rdma_netdev *rn = netdev_priv(dev);
@@ -917,7 +917,15 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 		spin_unlock_irqrestore(&priv->lock, flags);
 		++dev->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
-		return;
+		return NULL;
+	}
+
+	/* To avoid race condition, make sure that the
+	 * neigh will be added only once.
+	 */
+	if (unlikely(!list_empty(&neigh->list))) {
+		spin_unlock_irqrestore(&priv->lock, flags);
+		return neigh;
 	}
 
 	path = __path_find(dev, daddr + 4);
@@ -956,7 +964,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 			path->ah->last_send = rn->send(dev, skb, path->ah->ah,
 						       IPOIB_QPN(daddr));
 			ipoib_neigh_put(neigh);
-			return;
+			return NULL;
 		}
 	} else {
 		neigh->ah  = NULL;
@@ -973,7 +981,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 	ipoib_neigh_put(neigh);
-	return;
+	return NULL;
 
 err_path:
 	ipoib_neigh_free(neigh);
@@ -983,6 +991,8 @@ err_drop:
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 	ipoib_neigh_put(neigh);
+
+	return NULL;
 }
 
 static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
@@ -1091,8 +1101,9 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	case htons(ETH_P_TIPC):
 		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
 		if (unlikely(!neigh)) {
-			neigh_add_path(skb, phdr->hwaddr, dev);
-			return NETDEV_TX_OK;
+			neigh = neigh_add_path(skb, phdr->hwaddr, dev);
+			if (likely(!neigh))
+				return NETDEV_TX_OK;
 		}
 		break;
 	case htons(ETH_P_ARP):
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 93e149efc1f5..9b3f47ae2016 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -816,7 +816,10 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
 		spin_lock_irqsave(&priv->lock, flags);
 		if (!neigh) {
 			neigh = ipoib_neigh_alloc(daddr, dev);
-			if (neigh) {
+			/* Make sure that the neigh will be added only
+			 * once to mcast list.
+			 */
+			if (neigh && list_empty(&neigh->list)) {
 				kref_get(&mcast->ah->ref);
 				neigh->ah	= mcast->ah;
 				list_add_tail(&neigh->list, &mcast->neigh_list);

From 2196881566225f3c3428d1a5f847a992944daa5b Mon Sep 17 00:00:00 2001
From: Aliaksei Karaliou <akaraliou.dev@gmail.com>
Date: Thu, 21 Dec 2017 13:18:26 -0800
Subject: [PATCH 207/305] xfs: quota: fix missed destroy of qi_tree_lock

xfs_qm_destroy_quotainfo() does not destroy quotainfo->qi_tree_lock
even though it destroys quotainfo->qi_quotaofflock.

Signed-off-by: Aliaksei Karaliou <akaraliou.dev@gmail.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_qm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index ec952dfad359..d0053115427f 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -736,6 +736,7 @@ xfs_qm_destroy_quotainfo(
 		IRELE(qi->qi_pquotaip);
 		qi->qi_pquotaip = NULL;
 	}
+	mutex_destroy(&qi->qi_tree_lock);
 	mutex_destroy(&qi->qi_quotaofflock);
 	kmem_free(qi);
 	mp->m_quotainfo = NULL;

From 3a3882ff26fbdbaf5f7e13f6a0bccfbf7121041d Mon Sep 17 00:00:00 2001
From: Aliaksei Karaliou <akaraliou.dev@gmail.com>
Date: Thu, 21 Dec 2017 13:18:26 -0800
Subject: [PATCH 208/305] xfs: quota: check result of register_shrinker()

xfs_qm_init_quotainfo() does not check the result of register_shrinker(),
which was recently tagged as __must_check; this was reported by sparse.

Signed-off-by: Aliaksei Karaliou <akaraliou.dev@gmail.com>
[darrick: move xfs_qm_destroy_quotainos nearer xfs_qm_init_quotainos]
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_qm.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index d0053115427f..b897b11afb2c 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -48,7 +48,7 @@
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
 
-
+STATIC void	xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi);
 STATIC void	xfs_qm_dqfree_one(struct xfs_dquot *dqp);
 /*
  * We use the batch lookup interface to iterate over the dquots as it
@@ -695,9 +695,17 @@ xfs_qm_init_quotainfo(
 	qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
 	qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
 	qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
-	register_shrinker(&qinf->qi_shrinker);
+
+	error = register_shrinker(&qinf->qi_shrinker);
+	if (error)
+		goto out_free_inos;
+
 	return 0;
 
+out_free_inos:
+	mutex_destroy(&qinf->qi_quotaofflock);
+	mutex_destroy(&qinf->qi_tree_lock);
+	xfs_qm_destroy_quotainos(qinf);
 out_free_lru:
 	list_lru_destroy(&qinf->qi_lru);
 out_free_qinf:
@@ -706,7 +714,6 @@ out_free_qinf:
 	return error;
 }
 
-
 /*
  * Gets called when unmounting a filesystem or when all quotas get
  * turned off.
@@ -723,19 +730,7 @@ xfs_qm_destroy_quotainfo(
 
 	unregister_shrinker(&qi->qi_shrinker);
 	list_lru_destroy(&qi->qi_lru);
-
-	if (qi->qi_uquotaip) {
-		IRELE(qi->qi_uquotaip);
-		qi->qi_uquotaip = NULL; /* paranoia */
-	}
-	if (qi->qi_gquotaip) {
-		IRELE(qi->qi_gquotaip);
-		qi->qi_gquotaip = NULL;
-	}
-	if (qi->qi_pquotaip) {
-		IRELE(qi->qi_pquotaip);
-		qi->qi_pquotaip = NULL;
-	}
+	xfs_qm_destroy_quotainos(qi);
 	mutex_destroy(&qi->qi_tree_lock);
 	mutex_destroy(&qi->qi_quotaofflock);
 	kmem_free(qi);
@@ -1600,6 +1595,24 @@ error_rele:
 	return error;
 }
 
+STATIC void
+xfs_qm_destroy_quotainos(
+	xfs_quotainfo_t	*qi)
+{
+	if (qi->qi_uquotaip) {
+		IRELE(qi->qi_uquotaip);
+		qi->qi_uquotaip = NULL; /* paranoia */
+	}
+	if (qi->qi_gquotaip) {
+		IRELE(qi->qi_gquotaip);
+		qi->qi_gquotaip = NULL;
+	}
+	if (qi->qi_pquotaip) {
+		IRELE(qi->qi_pquotaip);
+		qi->qi_pquotaip = NULL;
+	}
+}
+
 STATIC void
 xfs_qm_dqfree_one(
 	struct xfs_dquot	*dqp)

From b4d8ad7fd3a18e6d92d4ebe858185c704604a57d Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 22 Dec 2017 13:14:34 -0800
Subject: [PATCH 209/305] xfs: fix s_maxbytes overflow problems

Fix some integer overflow problems if offset + count happen to be large
enough to cause an integer overflow.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c  | 4 ++--
 fs/xfs/xfs_iomap.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 21e2d70884e1..4fc526a27a94 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -399,7 +399,7 @@ xfs_map_blocks(
 	       (ip->i_df.if_flags & XFS_IFEXTENTS));
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 
-	if ((xfs_ufsize_t)offset + count > mp->m_super->s_maxbytes)
+	if (offset > mp->m_super->s_maxbytes - count)
 		count = mp->m_super->s_maxbytes - offset;
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1312,7 +1312,7 @@ xfs_get_blocks(
 	lockmode = xfs_ilock_data_map_shared(ip);
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
-	if ((xfs_ufsize_t)offset + size > mp->m_super->s_maxbytes)
+	if (offset > mp->m_super->s_maxbytes - size)
 		size = mp->m_super->s_maxbytes - offset;
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7ab52a8bc0a9..66e1edbfb2b2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1006,7 +1006,7 @@ xfs_file_iomap_begin(
 	}
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
-	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+	if (offset > mp->m_super->s_maxbytes - length)
 		length = mp->m_super->s_maxbytes - offset;
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	end_fsb = XFS_B_TO_FSB(mp, offset + length);

From 3bb23421a504f01551b7cb9dff0e41dbf16656b0 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@mellanox.com>
Date: Tue, 26 Dec 2017 07:48:51 +0200
Subject: [PATCH 210/305] net/sched: Fix update of lastuse in act modules
 implementing stats_update

We need to update lastuse to the most recent value between what
is already set and the new value.
If HW matching fails, e.g. because of an issue, the stats are not updated
but it could be that software did match and updated lastuse.

Fixes: 5712bf9c5c30 ("net/sched: act_mirred: Use passed lastuse argument")
Fixes: 9fea47d93bcc ("net/sched: act_gact: Update statistics when offloaded to hardware")
Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_gact.c   | 2 +-
 net/sched/act_mirred.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e29a48ef7fc3..a0ac42b3ed06 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -159,7 +159,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 	if (action == TC_ACT_SHOT)
 		this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
 
-	tm->lastuse = lastuse;
+	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
 static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8b3e59388480..08b61849c2a2 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -239,7 +239,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 	struct tcf_t *tm = &m->tcf_tm;
 
 	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
-	tm->lastuse = lastuse;
+	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
 static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,

From d02fd6e7d2933ede6478a15f9e4ce8a93845824e Mon Sep 17 00:00:00 2001
From: Gao Feng <gfree.wind@vip.163.com>
Date: Tue, 26 Dec 2017 21:44:32 +0800
Subject: [PATCH 211/305] macvlan: Fix one possible double free

Because macvlan_uninit frees the macvlan port, there is a possible
double free in macvlan_common_newlink. When the macvlan port has just
been created and register_netdevice or netdev_upper_dev_link then fails,
they invoke macvlan_uninit. The error path then reaches
macvlan_port_destroy, which triggers the double free.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index a178c5efd33e..a0f2be81d52e 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1444,9 +1444,14 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 	return 0;
 
 unregister_netdev:
+	/* macvlan_uninit would free the macvlan port */
 	unregister_netdevice(dev);
+	return err;
 destroy_macvlan_port:
-	if (create)
+	/* the macvlan port may be freed by macvlan_uninit when fail to register.
+	 * so we destroy the macvlan port only when it's valid.
+	 */
+	if (create && macvlan_port_get_rtnl(dev))
 		macvlan_port_destroy(port->dev);
 	return err;
 }

From ac817f5ad066697e4d4d35ec68c974eba2c5f17a Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 26 Dec 2017 23:15:12 +0000
Subject: [PATCH 212/305] phylink: ensure we report link down when LOS asserted

Although we disable the netdev carrier, we fail to report in the kernel
log that the link went down.  Fix this.

Fixes: 9525ae83959b ("phylink: add phylink infrastructure")
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 827f3f92560e..150cd95a6e1e 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1429,9 +1429,8 @@ static void phylink_sfp_link_down(void *upstream)
 	WARN_ON(!lockdep_rtnl_is_held());
 
 	set_bit(PHYLINK_DISABLE_LINK, &pl->phylink_disable_state);
+	queue_work(system_power_efficient_wq, &pl->resolve);
 	flush_work(&pl->resolve);
-
-	netif_carrier_off(pl->netdev);
 }
 
 static void phylink_sfp_link_up(void *upstream)

From 0b2122e4934c7783d336397864e34ee53aad0965 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 26 Dec 2017 23:15:17 +0000
Subject: [PATCH 213/305] sfp: fix sfp-bus oops when removing socket/upstream

When we remove a socket or upstream, and the other side isn't
registered, we dereference a NULL pointer, causing a kernel oops.
Fix this.

Fixes: ce0aa27ff3f6 ("sfp: add sfp-bus to bridge between network devices and sfp cages")
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp-bus.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index 8a1b1f4c1b7c..ab64a142b832 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -356,7 +356,8 @@ EXPORT_SYMBOL_GPL(sfp_register_upstream);
 void sfp_unregister_upstream(struct sfp_bus *bus)
 {
 	rtnl_lock();
-	sfp_unregister_bus(bus);
+	if (bus->sfp)
+		sfp_unregister_bus(bus);
 	bus->upstream = NULL;
 	bus->netdev = NULL;
 	rtnl_unlock();
@@ -459,7 +460,8 @@ EXPORT_SYMBOL_GPL(sfp_register_socket);
 void sfp_unregister_socket(struct sfp_bus *bus)
 {
 	rtnl_lock();
-	sfp_unregister_bus(bus);
+	if (bus->netdev)
+		sfp_unregister_bus(bus);
 	bus->sfp_dev = NULL;
 	bus->sfp = NULL;
 	bus->socket_ops = NULL;

From 0b76aae741abb9d16d2c0e67f8b1e766576f897d Mon Sep 17 00:00:00 2001
From: Tushar Dave <tushar.n.dave@oracle.com>
Date: Wed, 6 Dec 2017 02:26:29 +0530
Subject: [PATCH 214/305] e1000: fix disabling already-disabled warning

This patch adds a check so that the driver does not disable an
already-disabled device.

[   44.637743] advantechwdt: Unexpected close, not stopping watchdog!
[   44.997548] input: ImExPS/2 Generic Explorer Mouse as /devices/platform/i8042/serio1/input/input6
[   45.013419] e1000 0000:00:03.0: disabling already-disabled device
[   45.013447] ------------[ cut here ]------------
[   45.014868] WARNING: CPU: 1 PID: 71 at drivers/pci/pci.c:1641 pci_disable_device+0xa1/0x105:
						pci_disable_device at drivers/pci/pci.c:1640
[   45.016171] CPU: 1 PID: 71 Comm: rcu_perf_shutdo Not tainted 4.14.0-01330-g3c07399 #1
[   45.017197] task: ffff88011bee9e40 task.stack: ffffc90000860000
[   45.017987] RIP: 0010:pci_disable_device+0xa1/0x105:
						pci_disable_device at drivers/pci/pci.c:1640
[   45.018603] RSP: 0000:ffffc90000863e30 EFLAGS: 00010286
[   45.019282] RAX: 0000000000000035 RBX: ffff88013a230008 RCX: 0000000000000000
[   45.020182] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000203
[   45.021084] RBP: ffff88013a3f31e8 R08: 0000000000000001 R09: 0000000000000000
[   45.021986] R10: ffffffff827ec29c R11: 0000000000000002 R12: 0000000000000001
[   45.022946] R13: ffff88013a230008 R14: ffff880117802b20 R15: ffffc90000863e8f
[   45.023842] FS:  0000000000000000(0000) GS:ffff88013fd00000(0000) knlGS:0000000000000000
[   45.024863] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   45.025583] CR2: ffffc900006d4000 CR3: 000000000220f000 CR4: 00000000000006a0
[   45.026478] Call Trace:
[   45.026811]  __e1000_shutdown+0x1d4/0x1e2:
						__e1000_shutdown at drivers/net/ethernet/intel/e1000/e1000_main.c:5162
[   45.027344]  ? rcu_perf_cleanup+0x2a1/0x2a1:
						rcu_perf_shutdown at kernel/rcu/rcuperf.c:627
[   45.027883]  e1000_shutdown+0x14/0x3a:
						e1000_shutdown at drivers/net/ethernet/intel/e1000/e1000_main.c:5235
[   45.028351]  device_shutdown+0x110/0x1aa:
						device_shutdown at drivers/base/core.c:2807
[   45.028858]  kernel_power_off+0x31/0x64:
						kernel_power_off at kernel/reboot.c:260
[   45.029343]  rcu_perf_shutdown+0x9b/0xa7:
						rcu_perf_shutdown at kernel/rcu/rcuperf.c:637
[   45.029852]  ? __wake_up_common_lock+0xa2/0xa2:
						autoremove_wake_function at kernel/sched/wait.c:376
[   45.030414]  kthread+0x126/0x12e:
						kthread at kernel/kthread.c:233
[   45.030834]  ? __kthread_bind_mask+0x8e/0x8e:
						kthread at kernel/kthread.c:190
[   45.031399]  ? ret_from_fork+0x1f/0x30:
						ret_from_fork at arch/x86/entry/entry_64.S:443
[   45.031883]  ? kernel_init+0xa/0xf5:
						kernel_init at init/main.c:997
[   45.032325]  ret_from_fork+0x1f/0x30:
						ret_from_fork at arch/x86/entry/entry_64.S:443
[   45.032777] Code: 00 48 85 ed 75 07 48 8b ab a8 00 00 00 48 8d bb 98 00 00 00 e8 aa d1 11 00 48 89 ea 48 89 c6 48 c7 c7 d8 e4 0b 82 e8 55 7d da ff <0f> ff b9 01 00 00 00 31 d2 be 01 00 00 00 48 c7 c7 f0 b1 61 82
[   45.035222] ---[ end trace c257137b1b1976ef ]---
[   45.037838] ACPI: Preparing to enter system sleep state S5

Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
Tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000.h      |  3 ++-
 drivers/net/ethernet/intel/e1000/e1000_main.c | 27 +++++++++++++++----
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
index d7bdea79e9fa..8fd2458060a0 100644
--- a/drivers/net/ethernet/intel/e1000/e1000.h
+++ b/drivers/net/ethernet/intel/e1000/e1000.h
@@ -331,7 +331,8 @@ struct e1000_adapter {
 enum e1000_state_t {
 	__E1000_TESTING,
 	__E1000_RESETTING,
-	__E1000_DOWN
+	__E1000_DOWN,
+	__E1000_DISABLED
 };
 
 #undef pr_fmt
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 1982f7917a8d..3dd4aeb2706d 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -945,7 +945,7 @@ static int e1000_init_hw_struct(struct e1000_adapter *adapter,
 static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	struct net_device *netdev;
-	struct e1000_adapter *adapter;
+	struct e1000_adapter *adapter = NULL;
 	struct e1000_hw *hw;
 
 	static int cards_found;
@@ -955,6 +955,7 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	u16 tmp = 0;
 	u16 eeprom_apme_mask = E1000_EEPROM_APME;
 	int bars, need_ioport;
+	bool disable_dev = false;
 
 	/* do not allocate ioport bars when not needed */
 	need_ioport = e1000_is_need_ioport(pdev);
@@ -1259,11 +1260,13 @@ err_mdio_ioremap:
 	iounmap(hw->ce4100_gbe_mdio_base_virt);
 	iounmap(hw->hw_addr);
 err_ioremap:
+	disable_dev = !test_and_set_bit(__E1000_DISABLED, &adapter->flags);
 	free_netdev(netdev);
 err_alloc_etherdev:
 	pci_release_selected_regions(pdev, bars);
 err_pci_reg:
-	pci_disable_device(pdev);
+	if (!adapter || disable_dev)
+		pci_disable_device(pdev);
 	return err;
 }
 
@@ -1281,6 +1284,7 @@ static void e1000_remove(struct pci_dev *pdev)
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
+	bool disable_dev;
 
 	e1000_down_and_stop(adapter);
 	e1000_release_manageability(adapter);
@@ -1299,9 +1303,11 @@ static void e1000_remove(struct pci_dev *pdev)
 		iounmap(hw->flash_address);
 	pci_release_selected_regions(pdev, adapter->bars);
 
+	disable_dev = !test_and_set_bit(__E1000_DISABLED, &adapter->flags);
 	free_netdev(netdev);
 
-	pci_disable_device(pdev);
+	if (disable_dev)
+		pci_disable_device(pdev);
 }
 
 /**
@@ -5156,7 +5162,8 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake)
 	if (netif_running(netdev))
 		e1000_free_irq(adapter);
 
-	pci_disable_device(pdev);
+	if (!test_and_set_bit(__E1000_DISABLED, &adapter->flags))
+		pci_disable_device(pdev);
 
 	return 0;
 }
@@ -5200,6 +5207,10 @@ static int e1000_resume(struct pci_dev *pdev)
 		pr_err("Cannot enable PCI device from suspend\n");
 		return err;
 	}
+
+	/* flush memory to make sure state is correct */
+	smp_mb__before_atomic();
+	clear_bit(__E1000_DISABLED, &adapter->flags);
 	pci_set_master(pdev);
 
 	pci_enable_wake(pdev, PCI_D3hot, 0);
@@ -5274,7 +5285,9 @@ static pci_ers_result_t e1000_io_error_detected(struct pci_dev *pdev,
 
 	if (netif_running(netdev))
 		e1000_down(adapter);
-	pci_disable_device(pdev);
+
+	if (!test_and_set_bit(__E1000_DISABLED, &adapter->flags))
+		pci_disable_device(pdev);
 
 	/* Request a slot slot reset. */
 	return PCI_ERS_RESULT_NEED_RESET;
@@ -5302,6 +5315,10 @@ static pci_ers_result_t e1000_io_slot_reset(struct pci_dev *pdev)
 		pr_err("Cannot re-enable PCI device after reset.\n");
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
+
+	/* flush memory to make sure state is correct */
+	smp_mb__before_atomic();
+	clear_bit(__E1000_DISABLED, &adapter->flags);
 	pci_set_master(pdev);
 
 	pci_enable_wake(pdev, PCI_D3hot, 0);

From 4110e02eb45ea447ec6f5459c9934de0a273fb91 Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@suse.com>
Date: Mon, 11 Dec 2017 16:26:40 +0900
Subject: [PATCH 215/305] e1000e: Fix e1000_check_for_copper_link_ich8lan
 return value.

e1000e_check_for_copper_link() and e1000_check_for_copper_link_ich8lan()
are the two functions that may be assigned to mac.ops.check_for_link when
phy.media_type == e1000_media_type_copper. Commit 19110cfbb34d ("e1000e:
Separate signaling for link check/link up") changed the meaning of the
return value of check_for_link for copper media but only adjusted the first
function. This patch adjusts the second function likewise.

Reported-by: Christian Hesse <list@eworm.de>
Reported-by: Gabriel C <nix.or.die@gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=198047
Fixes: 19110cfbb34d ("e1000e: Separate signaling for link check/link up")
Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Tested-by: Christian Hesse <list@eworm.de>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/ich8lan.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c
index d6d4ed7acf03..31277d3bb7dc 100644
--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c
+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c
@@ -1367,6 +1367,9 @@ out:
  *  Checks to see of the link status of the hardware has changed.  If a
  *  change in link status has been detected, then we read the PHY registers
  *  to get the current speed/duplex if link exists.
+ *
+ *  Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link
+ *  up).
  **/
 static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
 {
@@ -1382,7 +1385,7 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
 	 * Change or Rx Sequence Error interrupt.
 	 */
 	if (!mac->get_link_status)
-		return 0;
+		return 1;
 
 	/* First we want to see if the MII Status Register reports
 	 * link.  If so, then we want to get the current speed/duplex
@@ -1613,10 +1616,12 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
 	 * different link partner.
 	 */
 	ret_val = e1000e_config_fc_after_link_up(hw);
-	if (ret_val)
+	if (ret_val) {
 		e_dbg("Error configuring flow control\n");
+		return ret_val;
+	}
 
-	return ret_val;
+	return 1;
 }
 
 static s32 e1000_get_variants_ich8lan(struct e1000_adapter *adapter)

From bd30ffc414e55194ed6149fad69a145550cb7c18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZ=20Lin=20=28=E6=9E=97=E4=B8=8A=E6=99=BA=29?=
 <sz.lin@moxa.com>
Date: Fri, 29 Dec 2017 17:02:17 +0800
Subject: [PATCH 216/305] NET: usb: qmi_wwan: add support for YUGA CLM920-NC5
 PID 0x9625
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds support for PID 0x9625 of YUGA CLM920-NC5.

YUGA CLM920-NC5 needs to enable QMI_WWAN_QUIRK_DTR before QMI operation.

qmicli -d /dev/cdc-wdm0 -p --dms-get-revision
[/dev/cdc-wdm0] Device revision retrieved:
        Revision: 'CLM920_NC5-V1  1  [Oct 23 2016 19:00:00]'

Signed-off-by: SZ Lin (林上智) <sz.lin@moxa.com>
Acked-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 3000ddd1c7e2..728819feab44 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1100,6 +1100,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x05c6, 0x9084, 4)},
 	{QMI_FIXED_INTF(0x05c6, 0x920d, 0)},
 	{QMI_FIXED_INTF(0x05c6, 0x920d, 5)},
+	{QMI_QUIRK_SET_DTR(0x05c6, 0x9625, 4)},	/* YUGA CLM920-NC5 */
 	{QMI_FIXED_INTF(0x0846, 0x68a2, 8)},
 	{QMI_FIXED_INTF(0x12d1, 0x140c, 1)},	/* Huawei E173 */
 	{QMI_FIXED_INTF(0x12d1, 0x14ac, 1)},	/* Huawei E1820 */

From 807fc072991861ff0cd7ac44267ff1dd76ef316e Mon Sep 17 00:00:00 2001
From: Yue Hin Lau <Yuehin.Lau@amd.com>
Date: Fri, 29 Dec 2017 11:11:18 +0000
Subject: [PATCH 217/305] drm/amd/display: call set csc_default if enable
 adjustment is false

Fixes a greenish tint on RV displays.

Signed-off-by: Yue Hin Lau <Yuehin.Lau@amd.com>
Reviewed-by: Eric Bernstein <Eric.Bernstein@amd.com>
Acked-by: Harry Wentland <harry.wentland@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
[drake@endlessm.com: backport to 4.15]
Signed-off-by: Daniel Drake <drake@endlessm.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h          | 2 +-
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c       | 6 ++----
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c | 2 ++
 drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h               | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h
index a9782b1aba47..34daf895f848 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h
@@ -1360,7 +1360,7 @@ void dpp1_cm_set_output_csc_adjustment(
 
 void dpp1_cm_set_output_csc_default(
 		struct dpp *dpp_base,
-		const struct default_adjustment *default_adjust);
+		enum dc_color_space colorspace);
 
 void dpp1_cm_set_gamut_remap(
 	struct dpp *dpp,
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c
index 40627c244bf5..ed1216b53465 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c
@@ -225,14 +225,13 @@ void dpp1_cm_set_gamut_remap(
 
 void dpp1_cm_set_output_csc_default(
 		struct dpp *dpp_base,
-		const struct default_adjustment *default_adjust)
+		enum dc_color_space colorspace)
 {
 
 	struct dcn10_dpp *dpp = TO_DCN10_DPP(dpp_base);
 	uint32_t ocsc_mode = 0;
 
-	if (default_adjust != NULL) {
-		switch (default_adjust->out_color_space) {
+	switch (colorspace) {
 		case COLOR_SPACE_SRGB:
 		case COLOR_SPACE_2020_RGB_FULLRANGE:
 			ocsc_mode = 0;
@@ -253,7 +252,6 @@ void dpp1_cm_set_output_csc_default(
 		case COLOR_SPACE_UNKNOWN:
 		default:
 			break;
-		}
 	}
 
 	REG_SET(CM_OCSC_CONTROL, 0, CM_OCSC_MODE, ocsc_mode);
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
index 961ad5c3b454..05dc01e54531 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
@@ -2097,6 +2097,8 @@ static void program_csc_matrix(struct pipe_ctx *pipe_ctx,
 			tbl_entry.color_space = color_space;
 			//tbl_entry.regval = matrix;
 			pipe_ctx->plane_res.dpp->funcs->opp_set_csc_adjustment(pipe_ctx->plane_res.dpp, &tbl_entry);
+	} else {
+		pipe_ctx->plane_res.dpp->funcs->opp_set_csc_default(pipe_ctx->plane_res.dpp, colorspace);
 	}
 }
 static bool is_lower_pipe_tree_visible(struct pipe_ctx *pipe_ctx)
diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
index 83a68460edcd..9420dfb94d39 100644
--- a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
+++ b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
@@ -64,7 +64,7 @@ struct dpp_funcs {
 
 	void (*opp_set_csc_default)(
 		struct dpp *dpp,
-		const struct default_adjustment *default_adjust);
+		enum dc_color_space colorspace);
 
 	void (*opp_set_csc_adjustment)(
 		struct dpp *dpp,

From 19d859a7205bc59ffc38303eb25ae394f61d21dc Mon Sep 17 00:00:00 2001
From: Xiongwei Song <sxwjean@gmail.com>
Date: Tue, 2 Jan 2018 21:24:55 +0800
Subject: [PATCH 218/305] drm/ttm: check the return value of kzalloc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the function ttm_page_alloc_init, a kzalloc call is made for the
variable _manager. We need to check its return value, since it may be NULL.

Signed-off-by: Xiongwei Song <sxwjean@gmail.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/ttm/ttm_page_alloc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index b5ba6441489f..5d252fb27a82 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -1007,6 +1007,8 @@ int ttm_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages)
 	pr_info("Initializing pool allocator\n");
 
 	_manager = kzalloc(sizeof(*_manager), GFP_KERNEL);
+	if (!_manager)
+		return -ENOMEM;
 
 	ttm_page_pool_init_locked(&_manager->wc_pool, GFP_HIGHUSER, "wc", 0);
 

From 0ae60d0c4f191c4241377cc3fc5931dc90ca3bbd Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 2 Jan 2018 20:40:21 +0100
Subject: [PATCH 219/305] parisc: Show unhashed hardware inventory

Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p")
Signed-off-by: Helge Deller <deller@gmx.de>
---
 arch/parisc/kernel/drivers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c
index d8f77358e2ba..29b99b8964aa 100644
--- a/arch/parisc/kernel/drivers.c
+++ b/arch/parisc/kernel/drivers.c
@@ -870,7 +870,7 @@ static void print_parisc_device(struct parisc_device *dev)
 	static int count;
 
 	print_pa_hwpath(dev, hw_path);
-	printk(KERN_INFO "%d. %s at 0x%p [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }",
+	printk(KERN_INFO "%d. %s at 0x%px [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }",
 		++count, dev->name, (void*) dev->hpa.start, hw_path, dev->id.hw_type,
 		dev->id.hversion_rev, dev->id.hversion, dev->id.sversion);
 

From 63b2c373137b16d948b08cffacc6abfcf4cffea6 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 2 Jan 2018 20:42:59 +0100
Subject: [PATCH 220/305] parisc: Show initial kernel memory layout unhashed

Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p")
Signed-off-by: Helge Deller <deller@gmx.de>
---
 arch/parisc/mm/init.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 13f7854e0d49..48f41399fc0b 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -631,11 +631,11 @@ void __init mem_init(void)
 	mem_init_print_info(NULL);
 #ifdef CONFIG_DEBUG_KERNEL /* double-sanity-check paranoia */
 	printk("virtual kernel memory layout:\n"
-	       "    vmalloc : 0x%p - 0x%p   (%4ld MB)\n"
-	       "    memory  : 0x%p - 0x%p   (%4ld MB)\n"
-	       "      .init : 0x%p - 0x%p   (%4ld kB)\n"
-	       "      .data : 0x%p - 0x%p   (%4ld kB)\n"
-	       "      .text : 0x%p - 0x%p   (%4ld kB)\n",
+	       "    vmalloc : 0x%px - 0x%px   (%4ld MB)\n"
+	       "    memory  : 0x%px - 0x%px   (%4ld MB)\n"
+	       "      .init : 0x%px - 0x%px   (%4ld kB)\n"
+	       "      .data : 0x%px - 0x%px   (%4ld kB)\n"
+	       "      .text : 0x%px - 0x%px   (%4ld kB)\n",
 
 	       (void*)VMALLOC_START, (void*)VMALLOC_END,
 	       (VMALLOC_END - VMALLOC_START) >> 20,

From 04903c06b4854d2e85f6e3c368d5d48c4ce55f09 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 2 Jan 2018 20:45:42 +0100
Subject: [PATCH 221/305] parisc: Show unhashed HPA of Dino chip

Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p")
Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/parisc/dino.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c
index 0b3fb99d9b89..7390fb8ca9d1 100644
--- a/drivers/parisc/dino.c
+++ b/drivers/parisc/dino.c
@@ -303,7 +303,7 @@ static void dino_mask_irq(struct irq_data *d)
 	struct dino_device *dino_dev = irq_data_get_irq_chip_data(d);
 	int local_irq = gsc_find_local_irq(d->irq, dino_dev->global_irq, DINO_LOCAL_IRQS);
 
-	DBG(KERN_WARNING "%s(0x%p, %d)\n", __func__, dino_dev, d->irq);
+	DBG(KERN_WARNING "%s(0x%px, %d)\n", __func__, dino_dev, d->irq);
 
 	/* Clear the matching bit in the IMR register */
 	dino_dev->imr &= ~(DINO_MASK_IRQ(local_irq));
@@ -316,7 +316,7 @@ static void dino_unmask_irq(struct irq_data *d)
 	int local_irq = gsc_find_local_irq(d->irq, dino_dev->global_irq, DINO_LOCAL_IRQS);
 	u32 tmp;
 
-	DBG(KERN_WARNING "%s(0x%p, %d)\n", __func__, dino_dev, d->irq);
+	DBG(KERN_WARNING "%s(0x%px, %d)\n", __func__, dino_dev, d->irq);
 
 	/*
 	** clear pending IRQ bits
@@ -396,7 +396,7 @@ ilr_again:
 	if (mask) {
 		if (--ilr_loop > 0)
 			goto ilr_again;
-		printk(KERN_ERR "Dino 0x%p: stuck interrupt %d\n", 
+		printk(KERN_ERR "Dino 0x%px: stuck interrupt %d\n",
 		       dino_dev->hba.base_addr, mask);
 		return IRQ_NONE;
 	}
@@ -553,7 +553,7 @@ dino_fixup_bus(struct pci_bus *bus)
         struct pci_dev *dev;
         struct dino_device *dino_dev = DINO_DEV(parisc_walk_tree(bus->bridge));
 
-	DBG(KERN_WARNING "%s(0x%p) bus %d platform_data 0x%p\n",
+	DBG(KERN_WARNING "%s(0x%px) bus %d platform_data 0x%px\n",
 	    __func__, bus, bus->busn_res.start,
 	    bus->bridge->platform_data);
 
@@ -854,7 +854,7 @@ static int __init dino_common_init(struct parisc_device *dev,
 	res->flags = IORESOURCE_IO; /* do not mark it busy ! */
 	if (request_resource(&ioport_resource, res) < 0) {
 		printk(KERN_ERR "%s: request I/O Port region failed "
-		       "0x%lx/%lx (hpa 0x%p)\n",
+		       "0x%lx/%lx (hpa 0x%px)\n",
 		       name, (unsigned long)res->start, (unsigned long)res->end,
 		       dino_dev->hba.base_addr);
 		return 1;

From 28df2f83c39554d9e64cd9d2a93b8e28e24df5b7 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 2 Jan 2018 20:47:01 +0100
Subject: [PATCH 222/305] parisc: Show unhashed EISA EEPROM address

Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p")
Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/parisc/eisa_eeprom.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/parisc/eisa_eeprom.c b/drivers/parisc/eisa_eeprom.c
index 4dd9b1308128..99a80da6fd2e 100644
--- a/drivers/parisc/eisa_eeprom.c
+++ b/drivers/parisc/eisa_eeprom.c
@@ -106,7 +106,7 @@ static int __init eisa_eeprom_init(void)
 		return retval;
 	}
 
-	printk(KERN_INFO "EISA EEPROM at 0x%p\n", eisa_eeprom_addr);
+	printk(KERN_INFO "EISA EEPROM at 0x%px\n", eisa_eeprom_addr);
 	return 0;
 }
 

From f8978bd95cf92f869f3d9b34c1b699f49253b8c6 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 1 Jan 2018 13:07:15 +0200
Subject: [PATCH 223/305] RDMA/netlink: Fix locking around
 __ib_device_get_by_index

Holding locks is mandatory when calling __ib_device_get_by_index,
otherwise there are races during the list iteration with device removal.

Since the locks are static to device.c, __ib_device_get_by_index can
never be called correctly by any user outside the file.

Make the function static and provide a safe function that gets the
correct locks and returns a kref'd pointer. Fix all callers.

Fixes: e5c9469efcb1 ("RDMA/netlink: Add nldev device doit implementation")
Fixes: c3f66f7b0052 ("RDMA/netlink: Implement nldev port doit callback")
Fixes: 7d02f605f0dc ("RDMA/netlink: Add nldev port dumpit implementation")
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/core_priv.h |  2 +-
 drivers/infiniband/core/device.c    | 18 +++++++++-
 drivers/infiniband/core/nldev.c     | 54 +++++++++++++++++++----------
 3 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index a1d687a664f8..66f0268f37a6 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -314,7 +314,7 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
 }
 #endif
 
-struct ib_device *__ib_device_get_by_index(u32 ifindex);
+struct ib_device *ib_device_get_by_index(u32 ifindex);
 /* RDMA device netlink */
 void nldev_init(void);
 void nldev_exit(void);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 30914f3baa5f..465520627e4b 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -134,7 +134,7 @@ static int ib_device_check_mandatory(struct ib_device *device)
 	return 0;
 }
 
-struct ib_device *__ib_device_get_by_index(u32 index)
+static struct ib_device *__ib_device_get_by_index(u32 index)
 {
 	struct ib_device *device;
 
@@ -145,6 +145,22 @@ struct ib_device *__ib_device_get_by_index(u32 index)
 	return NULL;
 }
 
+/*
+ * Caller is responsible to return refrerence count by calling put_device()
+ */
+struct ib_device *ib_device_get_by_index(u32 index)
+{
+	struct ib_device *device;
+
+	down_read(&lists_rwsem);
+	device = __ib_device_get_by_index(index);
+	if (device)
+		get_device(&device->dev);
+
+	up_read(&lists_rwsem);
+	return device;
+}
+
 static struct ib_device *__ib_device_get_by_name(const char *name)
 {
 	struct ib_device *device;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 9a05245a1acf..0dcd1aa6f683 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -142,27 +142,34 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
 
-	device = __ib_device_get_by_index(index);
+	device = ib_device_get_by_index(index);
 	if (!device)
 		return -EINVAL;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
+	if (!msg) {
+		err = -ENOMEM;
+		goto err;
+	}
 
 	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
 			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
 			0, 0);
 
 	err = fill_dev_info(msg, device);
-	if (err) {
-		nlmsg_free(msg);
-		return err;
-	}
+	if (err)
+		goto err_free;
 
 	nlmsg_end(msg, nlh);
 
+	put_device(&device->dev);
 	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_free:
+	nlmsg_free(msg);
+err:
+	put_device(&device->dev);
+	return err;
 }
 
 static int _nldev_get_dumpit(struct ib_device *device,
@@ -220,31 +227,40 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -EINVAL;
 
 	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-	device = __ib_device_get_by_index(index);
+	device = ib_device_get_by_index(index);
 	if (!device)
 		return -EINVAL;
 
 	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
-	if (!rdma_is_port_valid(device, port))
-		return -EINVAL;
+	if (!rdma_is_port_valid(device, port)) {
+		err = -EINVAL;
+		goto err;
+	}
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
+	if (!msg) {
+		err = -ENOMEM;
+		goto err;
+	}
 
 	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
 			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
 			0, 0);
 
 	err = fill_port_info(msg, device, port);
-	if (err) {
-		nlmsg_free(msg);
-		return err;
-	}
+	if (err)
+		goto err_free;
 
 	nlmsg_end(msg, nlh);
+	put_device(&device->dev);
 
 	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_free:
+	nlmsg_free(msg);
+err:
+	put_device(&device->dev);
+	return err;
 }
 
 static int nldev_port_get_dumpit(struct sk_buff *skb,
@@ -265,7 +281,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
 		return -EINVAL;
 
 	ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-	device = __ib_device_get_by_index(ifindex);
+	device = ib_device_get_by_index(ifindex);
 	if (!device)
 		return -EINVAL;
 
@@ -299,7 +315,9 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
 		nlmsg_end(skb, nlh);
 	}
 
-out:	cb->args[0] = idx;
+out:
+	put_device(&device->dev);
+	cb->args[0] = idx;
 	return skb->len;
 }
 

From 88776c0e70be0290f8357019d844aae15edaa967 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 2 Jan 2018 20:36:44 +0100
Subject: [PATCH 224/305] parisc: Fix alignment of pa_tlb_lock in assembly on
 32-bit SMP kernel

Qemu for PARISC reported strange failures on a 32-bit SMP parisc kernel
about "Not-handled unaligned insn 0x0e8011d6 and 0x0c2011c9."

Those opcodes evaluate to the ldcw() assembly instruction which requires
(on 32bit) an alignment of 16 bytes to ensure atomicity.

As it turns out, qemu is correct and in our assembly code in entry.S and
pacache.S we don't pay attention to the required alignment.

This patch fixes the problem by aligning the lock offset in assembly
code in the same manner as we do in our C-code.

Signed-off-by: Helge Deller <deller@gmx.de>
Cc: <stable@vger.kernel.org> # v4.0+
---
 arch/parisc/include/asm/ldcw.h |  2 ++
 arch/parisc/kernel/entry.S     | 13 +++++++++++--
 arch/parisc/kernel/pacache.S   |  9 +++++++--
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h
index dd5a08aaa4da..3eb4bfc1fb36 100644
--- a/arch/parisc/include/asm/ldcw.h
+++ b/arch/parisc/include/asm/ldcw.h
@@ -12,6 +12,7 @@
    for the semaphore.  */
 
 #define __PA_LDCW_ALIGNMENT	16
+#define __PA_LDCW_ALIGN_ORDER	4
 #define __ldcw_align(a) ({					\
 	unsigned long __ret = (unsigned long) &(a)->lock[0];	\
 	__ret = (__ret + __PA_LDCW_ALIGNMENT - 1)		\
@@ -29,6 +30,7 @@
    ldcd). */
 
 #define __PA_LDCW_ALIGNMENT	4
+#define __PA_LDCW_ALIGN_ORDER	2
 #define __ldcw_align(a) (&(a)->slock)
 #define __LDCW	"ldcw,co"
 
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index f3cecf5117cf..e95207c0565e 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -35,6 +35,7 @@
 #include <asm/pgtable.h>
 #include <asm/signal.h>
 #include <asm/unistd.h>
+#include <asm/ldcw.h>
 #include <asm/thread_info.h>
 
 #include <linux/linkage.h>
@@ -46,6 +47,14 @@
 #endif
 
 	.import		pa_tlb_lock,data
+	.macro  load_pa_tlb_lock reg
+#if __PA_LDCW_ALIGNMENT > 4
+	load32	PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg
+	depi	0,31,__PA_LDCW_ALIGN_ORDER, \reg
+#else
+	load32	PA(pa_tlb_lock), \reg
+#endif
+	.endm
 
 	/* space_to_prot macro creates a prot id from a space id */
 
@@ -457,7 +466,7 @@
 	.macro		tlb_lock	spc,ptp,pte,tmp,tmp1,fault
 #ifdef CONFIG_SMP
 	cmpib,COND(=),n	0,\spc,2f
-	load32		PA(pa_tlb_lock),\tmp
+	load_pa_tlb_lock \tmp
 1:	LDCW		0(\tmp),\tmp1
 	cmpib,COND(=)	0,\tmp1,1b
 	nop
@@ -480,7 +489,7 @@
 	/* Release pa_tlb_lock lock. */
 	.macro		tlb_unlock1	spc,tmp
 #ifdef CONFIG_SMP
-	load32		PA(pa_tlb_lock),\tmp
+	load_pa_tlb_lock \tmp
 	tlb_unlock0	\spc,\tmp
 #endif
 	.endm
diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S
index adf7187f8951..2d40c4ff3f69 100644
--- a/arch/parisc/kernel/pacache.S
+++ b/arch/parisc/kernel/pacache.S
@@ -36,6 +36,7 @@
 #include <asm/assembly.h>
 #include <asm/pgtable.h>
 #include <asm/cache.h>
+#include <asm/ldcw.h>
 #include <linux/linkage.h>
 
 	.text
@@ -333,8 +334,12 @@ ENDPROC_CFI(flush_data_cache_local)
 
 	.macro	tlb_lock	la,flags,tmp
 #ifdef CONFIG_SMP
-	ldil		L%pa_tlb_lock,%r1
-	ldo		R%pa_tlb_lock(%r1),\la
+#if __PA_LDCW_ALIGNMENT > 4
+	load32		pa_tlb_lock + __PA_LDCW_ALIGNMENT-1, \la
+	depi		0,31,__PA_LDCW_ALIGN_ORDER, \la
+#else
+	load32		pa_tlb_lock, \la
+#endif
 	rsm		PSW_SM_I,\flags
 1:	LDCW		0(\la),\tmp
 	cmpib,<>,n	0,\tmp,3f

From 71891e2dab6b55a870f8f7735e44a2963860b5c6 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 29 Dec 2017 10:02:52 -0800
Subject: [PATCH 225/305] ethtool: do not print warning for applications using
 legacy API

In the kernel log this message appears on every boot:
 "warning: `NetworkChangeNo' uses legacy ethtool link settings API,
  link modes are only partially reported"

When ethtool link settings API changed, it started complaining about
usages of old API. Ironically, the original patch was from google but
the application using the legacy API is chrome.

Linux ABI is fixed as much as possible. The kernel must not break it
and should not complain about applications using legacy APIs.
This patch just removes the warning since using legacy APIs
in Linux is perfectly acceptable.

Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f8fcf450a36e..8225416911ae 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -770,15 +770,6 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
 	return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
 }
 
-static void
-warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
-{
-	char name[sizeof(current->comm)];
-
-	pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n",
-		     get_task_comm(name, current), details);
-}
-
 /* Query device for its ethtool_cmd settings.
  *
  * Backward compatibility note: for compatibility with legacy ethtool,
@@ -805,10 +796,8 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 							   &link_ksettings);
 		if (err < 0)
 			return err;
-		if (!convert_link_ksettings_to_legacy_settings(&cmd,
-							       &link_ksettings))
-			warn_incomplete_ethtool_legacy_settings_conversion(
-				"link modes are only partially reported");
+		convert_link_ksettings_to_legacy_settings(&cmd,
+							  &link_ksettings);
 
 		/* send a sensible cmd tag back to user */
 		cmd.cmd = ETHTOOL_GSET;

From f9c935db8086231a35b7f5c2a53e3f1e10f388ee Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 29 Dec 2017 19:48:02 +0100
Subject: [PATCH 226/305] tipc: fix problems with multipoint-to-point flow
 control

In commit 04d7b574b245 ("tipc: add multipoint-to-point flow control") we
introduced a protocol for preventing buffer overflow when many group
members try to simultaneously send messages to the same receiving member.

Stress test of this mechanism has revealed a couple of related bugs:

- When the receiving member receives an advertisement REMIT message from
  one of the senders, it will sometimes prematurely activate a pending
  member and send it the remitted advertisement, although the upper
  limit for active senders has been reached. This leads to accumulation
  of illegal advertisements, and eventually to messages being dropped
  because of receive buffer overflow.

- When the receiving member leaves REMITTED state while a received
  message is being read, we fail to look at the pending queue to
  activate the oldest pending peer. This leads to some pending senders
  being starved out, and never getting the opportunity to profit from
  the remitted advertisement.

We fix the former in the function tipc_group_proto_rcv() by returning
directly from the function once it becomes clear that the remitting
peer cannot leave REMITTED state at that point.

We fix the latter in the function tipc_group_update_rcv_win() by looking
up and activating the longest pending peer when it becomes clear that the
remitting peer now can leave REMITTED state.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 8e12ab55346b..5f4ffae807ee 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -109,7 +109,8 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m,
 static void tipc_group_decr_active(struct tipc_group *grp,
 				   struct tipc_member *m)
 {
-	if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING)
+	if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING ||
+	    m->state == MBR_REMITTED)
 		grp->active_cnt--;
 }
 
@@ -562,7 +563,7 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 	int max_active = grp->max_active;
 	int reclaim_limit = max_active * 3 / 4;
 	int active_cnt = grp->active_cnt;
-	struct tipc_member *m, *rm;
+	struct tipc_member *m, *rm, *pm;
 
 	m = tipc_group_find_member(grp, node, port);
 	if (!m)
@@ -605,6 +606,17 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 			pr_warn_ratelimited("Rcv unexpected msg after REMIT\n");
 			tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
 		}
+		grp->active_cnt--;
+		list_del_init(&m->list);
+		if (list_empty(&grp->pending))
+			return;
+
+		/* Set oldest pending member to active and advertise */
+		pm = list_first_entry(&grp->pending, struct tipc_member, list);
+		pm->state = MBR_ACTIVE;
+		list_move_tail(&pm->list, &grp->active);
+		grp->active_cnt++;
+		tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq);
 		break;
 	case MBR_RECLAIMING:
 	case MBR_DISCOVERED:
@@ -742,14 +754,14 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 		if (!m || m->state != MBR_RECLAIMING)
 			return;
 
-		list_del_init(&m->list);
-		grp->active_cnt--;
 		remitted = msg_grp_remitted(hdr);
 
 		/* Messages preceding the REMIT still in receive queue */
 		if (m->advertised > remitted) {
 			m->state = MBR_REMITTED;
 			in_flight = m->advertised - remitted;
+			m->advertised = ADV_IDLE + in_flight;
+			return;
 		}
 		/* All messages preceding the REMIT have been read */
 		if (m->advertised <= remitted) {
@@ -761,6 +773,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 			tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
 
 		m->advertised = ADV_IDLE + in_flight;
+		grp->active_cnt--;
+		list_del_init(&m->list);
 
 		/* Set oldest pending member to active and advertise */
 		if (list_empty(&grp->pending))

From af1be2e21203867cb958aaceed5366e2e24b88e8 Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Fri, 8 Dec 2017 08:45:57 -0800
Subject: [PATCH 227/305] ARC: handle gcc generated __builtin_trap for older
 compiler

ARC gcc prior to GNU 2018.03 release didn't have a target specific
__builtin_trap() implementation, generating default abort() call.

Implement the abort() call - emulating what newer gcc does for the same,
as suggested by Arnd.

Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/kernel/traps.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c
index 004f4e4a4c10..133a4dae41fe 100644
--- a/arch/arc/kernel/traps.c
+++ b/arch/arc/kernel/traps.c
@@ -161,3 +161,11 @@ void do_insterror_or_kprobe(unsigned long address, struct pt_regs *regs)
 
 	insterror_is_error(address, regs);
 }
+
+/*
+ * abort() call generated by older gcc for __builtin_trap()
+ */
+void abort(void)
+{
+	__asm__ __volatile__("trap_s  5\n");
+}

From 835bcec5fdf3f9e880111b482177e7e70e3596da Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Tue, 2 Jan 2018 17:21:09 +0000
Subject: [PATCH 228/305] x86/efi: Fix kernel param add_efi_memmap regression

'add_efi_memmap' is an early param, but do_add_efi_memmap() has no
chance to run because the code path is before parse_early_param().
I believe it worked when the param was introduced but probably later
some other changes caused the wrong order and nobody noticed it.

Move efi_memblock_x86_reserve_range() after parse_early_param()
to fix it.

Signed-off-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Cc: Ge Song <ge.song@hxt-semitech.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20180102172110.17018-2-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/setup.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8af2e8d0c0a1..145810b0edf6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -906,9 +906,6 @@ void __init setup_arch(char **cmdline_p)
 		set_bit(EFI_BOOT, &efi.flags);
 		set_bit(EFI_64BIT, &efi.flags);
 	}
-
-	if (efi_enabled(EFI_BOOT))
-		efi_memblock_x86_reserve_range();
 #endif
 
 	x86_init.oem.arch_setup();
@@ -962,6 +959,8 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
+	if (efi_enabled(EFI_BOOT))
+		efi_memblock_x86_reserve_range();
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/*
 	 * Memory used by the kernel cannot be hot-removed because Linux

From f24c4d478013d82bd1b943df566fff3561d52864 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 2 Jan 2018 17:21:10 +0000
Subject: [PATCH 229/305] efi/capsule-loader: Reinstate virtual capsule mapping

Commit:

  82c3768b8d68 ("efi/capsule-loader: Use a cached copy of the capsule header")

... refactored the capsule loading code that maps the capsule header,
to avoid having to map it several times.

However, as it turns out, the vmap() call we ended up removing did not
just map the header, but the entire capsule image, and dropping this
virtual mapping breaks capsules that are processed by the firmware
immediately (i.e., without a reboot).

Unfortunately, that change was part of a larger refactor that allowed
a quirk to be implemented for Quark, which has a non-standard memory
layout for capsules, and we have slightly painted ourselves into a
corner by allowing quirk code to mangle the capsule header and memory
layout.

So we need to fix this without breaking Quark. Fortunately, Quark does
not appear to care about the virtual mapping, and so we can simply
do a partial revert of commit:

  2a457fb31df6 ("efi/capsule-loader: Use page addresses rather than struct page pointers")

... and create a vmap() mapping of the entire capsule (including header)
based on the reinstated struct page array, unless running on Quark, in
which case we pass the capsule header copy as before.

Reported-by: Ge Song <ge.song@hxt-semitech.com>
Tested-by: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Tested-by: Ge Song <ge.song@hxt-semitech.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: <stable@vger.kernel.org>
Cc: Dave Young <dyoung@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Fixes: 82c3768b8d68 ("efi/capsule-loader: Use a cached copy of the capsule header")
Link: http://lkml.kernel.org/r/20180102172110.17018-3-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/efi/quirks.c        | 13 +++++++-
 drivers/firmware/efi/capsule-loader.c | 45 ++++++++++++++++++++++-----
 include/linux/efi.h                   |  4 ++-
 3 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 8a99a2e96537..5b513ccffde4 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -592,7 +592,18 @@ static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff,
 	/*
 	 * Update the first page pointer to skip over the CSH header.
 	 */
-	cap_info->pages[0] += csh->headersize;
+	cap_info->phys[0] += csh->headersize;
+
+	/*
+	 * cap_info->capsule should point at a virtual mapping of the entire
+	 * capsule, starting at the capsule header. Our image has the Quark
+	 * security header prepended, so we cannot rely on the default vmap()
+	 * mapping created by the generic capsule code.
+	 * Given that the Quark firmware does not appear to care about the
+	 * virtual mapping, let's just point cap_info->capsule at our copy
+	 * of the capsule header.
+	 */
+	cap_info->capsule = &cap_info->header;
 
 	return 1;
 }
diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c
index ec8ac5c4dd84..055e2e8f985a 100644
--- a/drivers/firmware/efi/capsule-loader.c
+++ b/drivers/firmware/efi/capsule-loader.c
@@ -20,10 +20,6 @@
 
 #define NO_FURTHER_WRITE_ACTION -1
 
-#ifndef phys_to_page
-#define phys_to_page(x)		pfn_to_page((x) >> PAGE_SHIFT)
-#endif
-
 /**
  * efi_free_all_buff_pages - free all previous allocated buffer pages
  * @cap_info: pointer to current instance of capsule_info structure
@@ -35,7 +31,7 @@
 static void efi_free_all_buff_pages(struct capsule_info *cap_info)
 {
 	while (cap_info->index > 0)
-		__free_page(phys_to_page(cap_info->pages[--cap_info->index]));
+		__free_page(cap_info->pages[--cap_info->index]);
 
 	cap_info->index = NO_FURTHER_WRITE_ACTION;
 }
@@ -71,6 +67,14 @@ int __efi_capsule_setup_info(struct capsule_info *cap_info)
 
 	cap_info->pages = temp_page;
 
+	temp_page = krealloc(cap_info->phys,
+			     pages_needed * sizeof(phys_addr_t *),
+			     GFP_KERNEL | __GFP_ZERO);
+	if (!temp_page)
+		return -ENOMEM;
+
+	cap_info->phys = temp_page;
+
 	return 0;
 }
 
@@ -105,9 +109,24 @@ int __weak efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
  **/
 static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info)
 {
+	bool do_vunmap = false;
 	int ret;
 
-	ret = efi_capsule_update(&cap_info->header, cap_info->pages);
+	/*
+	 * cap_info->capsule may have been assigned already by a quirk
+	 * handler, so only overwrite it if it is NULL
+	 */
+	if (!cap_info->capsule) {
+		cap_info->capsule = vmap(cap_info->pages, cap_info->index,
+					 VM_MAP, PAGE_KERNEL);
+		if (!cap_info->capsule)
+			return -ENOMEM;
+		do_vunmap = true;
+	}
+
+	ret = efi_capsule_update(cap_info->capsule, cap_info->phys);
+	if (do_vunmap)
+		vunmap(cap_info->capsule);
 	if (ret) {
 		pr_err("capsule update failed\n");
 		return ret;
@@ -165,10 +184,12 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff,
 			goto failed;
 		}
 
-		cap_info->pages[cap_info->index++] = page_to_phys(page);
+		cap_info->pages[cap_info->index] = page;
+		cap_info->phys[cap_info->index] = page_to_phys(page);
 		cap_info->page_bytes_remain = PAGE_SIZE;
+		cap_info->index++;
 	} else {
-		page = phys_to_page(cap_info->pages[cap_info->index - 1]);
+		page = cap_info->pages[cap_info->index - 1];
 	}
 
 	kbuff = kmap(page);
@@ -252,6 +273,7 @@ static int efi_capsule_release(struct inode *inode, struct file *file)
 	struct capsule_info *cap_info = file->private_data;
 
 	kfree(cap_info->pages);
+	kfree(cap_info->phys);
 	kfree(file->private_data);
 	file->private_data = NULL;
 	return 0;
@@ -281,6 +303,13 @@ static int efi_capsule_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 	}
 
+	cap_info->phys = kzalloc(sizeof(void *), GFP_KERNEL);
+	if (!cap_info->phys) {
+		kfree(cap_info->pages);
+		kfree(cap_info);
+		return -ENOMEM;
+	}
+
 	file->private_data = cap_info;
 
 	return 0;
diff --git a/include/linux/efi.h b/include/linux/efi.h
index d813f7b04da7..29fdf8029cf6 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -140,11 +140,13 @@ struct efi_boot_memmap {
 
 struct capsule_info {
 	efi_capsule_header_t	header;
+	efi_capsule_header_t	*capsule;
 	int			reset_type;
 	long			index;
 	size_t			count;
 	size_t			total_size;
-	phys_addr_t		*pages;
+	struct page		**pages;
+	phys_addr_t		*phys;
 	size_t			page_bytes_remain;
 };
 

From 81b60dbff04980a45b348c5b5eeca2713d4594ca Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Wed, 3 Jan 2018 09:44:17 +0000
Subject: [PATCH 230/305] MAINTAINERS: Remove Matt Fleming as EFI co-maintainer

Instate Ard Biesheuvel as the sole EFI maintainer and leave other folks
as maintainers for the EFI test driver and efivarfs file system.

Also add Ard Biesheuvel as the EFI test driver and efivarfs maintainer.

Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Ivan Hu <ivan.hu@canonical.com>
Cc: Jeremy Kerr <jk@ozlabs.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20180103094417.6353-1-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 MAINTAINERS | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index b46c9cea5ae5..95c3fa1f520f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5149,15 +5149,15 @@ F:	sound/usb/misc/ua101.c
 EFI TEST DRIVER
 L:	linux-efi@vger.kernel.org
 M:	Ivan Hu <ivan.hu@canonical.com>
-M:	Matt Fleming <matt@codeblueprint.co.uk>
+M:	Ard Biesheuvel <ard.biesheuvel@linaro.org>
 S:	Maintained
 F:	drivers/firmware/efi/test/
 
 EFI VARIABLE FILESYSTEM
 M:	Matthew Garrett <matthew.garrett@nebula.com>
 M:	Jeremy Kerr <jk@ozlabs.org>
-M:	Matt Fleming <matt@codeblueprint.co.uk>
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/efi.git
+M:	Ard Biesheuvel <ard.biesheuvel@linaro.org>
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git
 L:	linux-efi@vger.kernel.org
 S:	Maintained
 F:	fs/efivarfs/
@@ -5318,7 +5318,6 @@ S:	Supported
 F:	security/integrity/evm/
 
 EXTENSIBLE FIRMWARE INTERFACE (EFI)
-M:	Matt Fleming <matt@codeblueprint.co.uk>
 M:	Ard Biesheuvel <ard.biesheuvel@linaro.org>
 L:	linux-efi@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git

From 87faa0d9b43b4755ff6963a22d1fd1bee1aa3b39 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Jan 2018 15:18:44 +0100
Subject: [PATCH 231/305] x86/pti: Enable PTI by default

This really wants to be enabled by default. Users who know what they are
doing can disable it either in the config or on the kernel command line.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
---
 security/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/security/Kconfig b/security/Kconfig
index a623d13bf288..3d4debd0257e 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -56,6 +56,7 @@ config SECURITY_NETWORK
 
 config PAGE_TABLE_ISOLATION
 	bool "Remove the kernel mapping in user mode"
+	default y
 	depends on X86_64 && !UML
 	help
 	  This feature reduces the number of hardware side channels by

From 694d99d40972f12e59a3696effee8a376b79d7c8 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Tue, 26 Dec 2017 23:43:54 -0600
Subject: [PATCH 232/305] x86/cpu, x86/pti: Do not enable PTI on AMD processors

AMD processors are not subject to the types of attacks that the kernel
page table isolation feature protects against.  The AMD microarchitecture
does not allow memory references, including speculative references, that
access higher privileged data when running in a lesser privileged mode
when that access would result in a page fault.

Disable page table isolation by default on AMD processors by not setting
the X86_BUG_CPU_INSECURE feature, which controls whether X86_FEATURE_PTI
is set.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20171227054354.20369.94587.stgit@tlendack-t1.amdoffice.net
---
 arch/x86/kernel/cpu/common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2a94dfb434e..b1be494ab4e8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -899,8 +899,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
 
-	/* Assume for now that ALL x86 CPUs are insecure */
-	setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+	if (c->x86_vendor != X86_VENDOR_AMD)
+		setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
 
 	fpu__init_system(c);
 

From 52994c256df36fda9a715697431cba9daecb6b11 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Jan 2018 15:57:59 +0100
Subject: [PATCH 233/305] x86/pti: Make sure the user/kernel PTEs match

Meelis reported that his K8 Athlon64 emits MCE warnings when PTI is
enabled:

[Hardware Error]: Error Addr: 0x0000ffff81e000e0
[Hardware Error]: MC1 Error: L1 TLB multimatch.
[Hardware Error]: cache level: L1, tx: INSN

The address is in the entry area, which is mapped into kernel _AND_ user
space. That's special because we switch CR3 while we are executing
there.

User mapping:
0xffffffff81e00000-0xffffffff82000000           2M     ro         PSE     GLB x  pmd

Kernel mapping:
0xffffffff81000000-0xffffffff82000000          16M     ro         PSE         x  pmd

So the K8 is complaining that the TLB entries differ. They differ in the
GLB bit.

Drop the GLB bit when installing the user shared mapping.

Fixes: 6dc72c3cbca0 ("x86/mm/pti: Share entry text PMD")
Reported-by: Meelis Roos <mroos@linux.ee>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Meelis Roos <mroos@linux.ee>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031407180.1957@nanos
---
 arch/x86/mm/pti.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index bce8aea65606..2da28ba97508 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -367,7 +367,8 @@ static void __init pti_setup_espfix64(void)
 static void __init pti_clone_entry_text(void)
 {
 	pti_clone_pmds((unsigned long) __entry_text_start,
-			(unsigned long) __irqentry_text_end, _PAGE_RW);
+			(unsigned long) __irqentry_text_end,
+		       _PAGE_RW | _PAGE_GLOBAL);
 }
 
 /*

From a9cdbe72c4e8bf3b38781c317a79326e2e1a230d Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Sun, 31 Dec 2017 10:18:06 -0600
Subject: [PATCH 234/305] x86/dumpstack: Fix partial register dumps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The show_regs_safe() logic is wrong.  When there's an iret stack frame,
it prints the entire pt_regs -- most of which is random stack data --
instead of just the five registers at the end.

show_regs_safe() is also poorly named: the on_stack() checks aren't for
safety.  Rename the function to show_regs_if_on_stack() and add a
comment to explain why the checks are needed.

These issues were introduced with the "partial register dump" feature of
the following commit:

  b02fcf9ba121 ("x86/unwinder: Handle stack overflows more gracefully")

That patch had gone through a few iterations of development, and the
above issues were artifacts from a previous iteration of the patch where
'regs' pointed directly to the iret frame rather than to the (partially
empty) pt_regs.

Tested-by: Alexander Tsoy <alexander@tsoy.me>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toralf Förster <toralf.foerster@gmx.de>
Cc: stable@vger.kernel.org
Fixes: b02fcf9ba121 ("x86/unwinder: Handle stack overflows more gracefully")
Link: http://lkml.kernel.org/r/5b05b8b344f59db2d3d50dbdeba92d60f2304c54.1514736742.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/unwind.h | 17 +++++++++++++----
 arch/x86/kernel/dumpstack.c   | 28 ++++++++++++++++++++--------
 arch/x86/kernel/stacktrace.c  |  2 +-
 3 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index c1688c2d0a12..1f86e1b0a5cd 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -56,18 +56,27 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
 
 #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
 /*
- * WARNING: The entire pt_regs may not be safe to dereference.  In some cases,
- * only the iret frame registers are accessible.  Use with caution!
+ * If 'partial' returns true, only the iret frame registers are valid.
  */
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+						    bool *partial)
 {
 	if (unwind_done(state))
 		return NULL;
 
+	if (partial) {
+#ifdef CONFIG_UNWINDER_ORC
+		*partial = !state->full_regs;
+#else
+		*partial = false;
+#endif
+	}
+
 	return state->regs;
 }
 #else
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+						    bool *partial)
 {
 	return NULL;
 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 5fa110699ed2..d0bb176a7261 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -76,12 +76,23 @@ void show_iret_regs(struct pt_regs *regs)
 		regs->sp, regs->flags);
 }
 
-static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
+static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
+				  bool partial)
 {
-	if (on_stack(info, regs, sizeof(*regs)))
+	/*
+	 * These on_stack() checks aren't strictly necessary: the unwind code
+	 * has already validated the 'regs' pointer.  The checks are done for
+	 * ordering reasons: if the registers are on the next stack, we don't
+	 * want to print them out yet.  Otherwise they'll be shown as part of
+	 * the wrong stack.  Later, when show_trace_log_lvl() switches to the
+	 * next stack, this function will be called again with the same regs so
+	 * they can be printed in the right context.
+	 */
+	if (!partial && on_stack(info, regs, sizeof(*regs))) {
 		__show_regs(regs, 0);
-	else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
-			  IRET_FRAME_SIZE)) {
+
+	} else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+				       IRET_FRAME_SIZE)) {
 		/*
 		 * When an interrupt or exception occurs in entry code, the
 		 * full pt_regs might not have been saved yet.  In that case
@@ -98,6 +109,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	struct stack_info stack_info = {0};
 	unsigned long visit_mask = 0;
 	int graph_idx = 0;
+	bool partial;
 
 	printk("%sCall Trace:\n", log_lvl);
 
@@ -140,7 +152,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			printk("%s <%s>\n", log_lvl, stack_name);
 
 		if (regs)
-			show_regs_safe(&stack_info, regs);
+			show_regs_if_on_stack(&stack_info, regs, partial);
 
 		/*
 		 * Scan the stack, printing any text addresses we find.  At the
@@ -164,7 +176,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
 			/*
 			 * Don't print regs->ip again if it was already printed
-			 * by show_regs_safe() below.
+			 * by show_regs_if_on_stack().
 			 */
 			if (regs && stack == &regs->ip)
 				goto next;
@@ -199,9 +211,9 @@ next:
 			unwind_next_frame(&state);
 
 			/* if the frame has entry regs, print them */
-			regs = unwind_get_entry_regs(&state);
+			regs = unwind_get_entry_regs(&state, &partial);
 			if (regs)
-				show_regs_safe(&stack_info, regs);
+				show_regs_if_on_stack(&stack_info, regs, partial);
 		}
 
 		if (stack_name)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 8dabd7bf1673..60244bfaf88f 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -98,7 +98,7 @@ static int __save_stack_trace_reliable(struct stack_trace *trace,
 	for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state);
 	     unwind_next_frame(&state)) {
 
-		regs = unwind_get_entry_regs(&state);
+		regs = unwind_get_entry_regs(&state, NULL);
 		if (regs) {
 			/*
 			 * Kernel mode registers on the stack indicate an

From 3ffdeb1a02be3086f1411a15c5b9c481fa28e21f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Sun, 31 Dec 2017 10:18:07 -0600
Subject: [PATCH 235/305] x86/dumpstack: Print registers for first stack frame
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the stack dump code, if the frame after the starting pt_regs is also
a regs frame, the registers don't get printed.  Fix that.

Reported-by: Andy Lutomirski <luto@amacapital.net>
Tested-by: Alexander Tsoy <alexander@tsoy.me>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toralf Förster <toralf.foerster@gmx.de>
Cc: stable@vger.kernel.org
Fixes: 3b3fa11bc700 ("x86/dumpstack: Print any pt_regs found on the stack")
Link: http://lkml.kernel.org/r/396f84491d2f0ef64eda4217a2165f5712f6a115.1514736742.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/dumpstack.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index d0bb176a7261..afbecff161d1 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -115,6 +115,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
 	unwind_start(&state, task, regs, stack);
 	stack = stack ? : get_stack_pointer(task, regs);
+	regs = unwind_get_entry_regs(&state, &partial);
 
 	/*
 	 * Iterate through the stacks, starting with the current stack pointer.
@@ -132,7 +133,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	 * - hardirq stack
 	 * - entry stack
 	 */
-	for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+	for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
 		const char *stack_name;
 
 		if (get_stack_info(stack, task, &stack_info, &visit_mask)) {

From c0bace798436bca0fdc221ff61143f1376a9c3de Mon Sep 17 00:00:00 2001
From: Felix Janda <felix.janda@posteo.de>
Date: Mon, 1 Jan 2018 19:33:20 +0100
Subject: [PATCH 236/305] uapi libc compat: add fallback for unsupported libcs

libc-compat.h aims to prevent symbol collisions between uapi and libc
headers for each supported libc. This requires continuous coordination
between them.

The goal of this commit is to improve the situation for libcs (such as
musl) which are not yet supported and/or do not wish to be explicitly
supported, while not affecting supported libcs. More precisely, with
this commit, unsupported libcs can request the suppression of any
specific uapi definition by defining the corresponding __UAPI_DEF_*
macro as 0. This can fix symbol collisions for them, as long as the
libc headers are included before the uapi headers. Inclusion in the
other order is outside the scope of this commit.

All infrastructure in order to enable this fallback for unsupported
libcs is already in place, except that libc-compat.h unconditionally
defines all __UAPI_DEF_* macros to 1 for all unsupported libcs so that
any previous definitions are ignored. In order to fix this, this commit
merely makes these definitions conditional.

This commit together with the musl libc commit

http://git.musl-libc.org/cgit/musl/commit/?id=04983f2272382af92eb8f8838964ff944fbb8258

fixes for example the following compiler errors when <linux/in6.h> is
included after musl's <netinet/in.h>:

./linux/in6.h:32:8: error: redefinition of 'struct in6_addr'
./linux/in6.h:49:8: error: redefinition of 'struct sockaddr_in6'
./linux/in6.h:59:8: error: redefinition of 'struct ipv6_mreq'

The comments referencing glibc are still correct, but this file is not
only used for glibc any more.

Signed-off-by: Felix Janda <felix.janda@posteo.de>
Reviewed-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/libc-compat.h | 55 +++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h
index 282875cf8056..8254c937c9f4 100644
--- a/include/uapi/linux/libc-compat.h
+++ b/include/uapi/linux/libc-compat.h
@@ -168,46 +168,99 @@
 
 /* If we did not see any headers from any supported C libraries,
  * or we are being included in the kernel, then define everything
- * that we need. */
+ * that we need. Check for previous __UAPI_* definitions to give
+ * unsupported C libraries a way to opt out of any kernel definition. */
 #else /* !defined(__GLIBC__) */
 
 /* Definitions for if.h */
+#ifndef __UAPI_DEF_IF_IFCONF
 #define __UAPI_DEF_IF_IFCONF 1
+#endif
+#ifndef __UAPI_DEF_IF_IFMAP
 #define __UAPI_DEF_IF_IFMAP 1
+#endif
+#ifndef __UAPI_DEF_IF_IFNAMSIZ
 #define __UAPI_DEF_IF_IFNAMSIZ 1
+#endif
+#ifndef __UAPI_DEF_IF_IFREQ
 #define __UAPI_DEF_IF_IFREQ 1
+#endif
 /* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */
+#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS
 #define __UAPI_DEF_IF_NET_DEVICE_FLAGS 1
+#endif
 /* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */
+#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO
 #define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1
+#endif
 
 /* Definitions for in.h */
+#ifndef __UAPI_DEF_IN_ADDR
 #define __UAPI_DEF_IN_ADDR		1
+#endif
+#ifndef __UAPI_DEF_IN_IPPROTO
 #define __UAPI_DEF_IN_IPPROTO		1
+#endif
+#ifndef __UAPI_DEF_IN_PKTINFO
 #define __UAPI_DEF_IN_PKTINFO		1
+#endif
+#ifndef __UAPI_DEF_IP_MREQ
 #define __UAPI_DEF_IP_MREQ		1
+#endif
+#ifndef __UAPI_DEF_SOCKADDR_IN
 #define __UAPI_DEF_SOCKADDR_IN		1
+#endif
+#ifndef __UAPI_DEF_IN_CLASS
 #define __UAPI_DEF_IN_CLASS		1
+#endif
 
 /* Definitions for in6.h */
+#ifndef __UAPI_DEF_IN6_ADDR
 #define __UAPI_DEF_IN6_ADDR		1
+#endif
+#ifndef __UAPI_DEF_IN6_ADDR_ALT
 #define __UAPI_DEF_IN6_ADDR_ALT		1
+#endif
+#ifndef __UAPI_DEF_SOCKADDR_IN6
 #define __UAPI_DEF_SOCKADDR_IN6		1
+#endif
+#ifndef __UAPI_DEF_IPV6_MREQ
 #define __UAPI_DEF_IPV6_MREQ		1
+#endif
+#ifndef __UAPI_DEF_IPPROTO_V6
 #define __UAPI_DEF_IPPROTO_V6		1
+#endif
+#ifndef __UAPI_DEF_IPV6_OPTIONS
 #define __UAPI_DEF_IPV6_OPTIONS		1
+#endif
+#ifndef __UAPI_DEF_IN6_PKTINFO
 #define __UAPI_DEF_IN6_PKTINFO		1
+#endif
+#ifndef __UAPI_DEF_IP6_MTUINFO
 #define __UAPI_DEF_IP6_MTUINFO		1
+#endif
 
 /* Definitions for ipx.h */
+#ifndef __UAPI_DEF_SOCKADDR_IPX
 #define __UAPI_DEF_SOCKADDR_IPX			1
+#endif
+#ifndef __UAPI_DEF_IPX_ROUTE_DEFINITION
 #define __UAPI_DEF_IPX_ROUTE_DEFINITION		1
+#endif
+#ifndef __UAPI_DEF_IPX_INTERFACE_DEFINITION
 #define __UAPI_DEF_IPX_INTERFACE_DEFINITION	1
+#endif
+#ifndef __UAPI_DEF_IPX_CONFIG_DATA
 #define __UAPI_DEF_IPX_CONFIG_DATA		1
+#endif
+#ifndef __UAPI_DEF_IPX_ROUTE_DEF
 #define __UAPI_DEF_IPX_ROUTE_DEF		1
+#endif
 
 /* Definitions for xattr.h */
+#ifndef __UAPI_DEF_XATTR
 #define __UAPI_DEF_XATTR		1
+#endif
 
 #endif /* __GLIBC__ */
 

From c095508770aebf1b9218e77026e48345d719b17c Mon Sep 17 00:00:00 2001
From: Mohamed Ghannam <simo.ghannam@gmail.com>
Date: Tue, 2 Jan 2018 19:44:34 +0000
Subject: [PATCH 237/305] RDS: Heap OOB write in rds_message_alloc_sgs()

When args->nr_local is 0, nr_pages also ends up as 0 due to the size
calculation done via rds_rm_size(). That value is later used to allocate
pages for DMA, so this bug produces a heap out-of-bounds write access
to a specific memory region.

Signed-off-by: Mohamed Ghannam <simo.ghannam@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/rdma.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index bc2f1e0977d6..94729d9da437 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -525,6 +525,9 @@ int rds_rdma_extra_size(struct rds_rdma_args *args)
 
 	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
 
+	if (args->nr_local == 0)
+		return -EINVAL;
+
 	/* figure out the number of pages in the vector */
 	for (i = 0; i < args->nr_local; i++) {
 		if (copy_from_user(&vec, &local_vec[i],

From 79d0895140e937ba111e6420b4cd83ee75efa788 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 2 Jan 2018 19:44:37 -0200
Subject: [PATCH 238/305] sctp: fix error path in sctp_stream_init

syzbot noticed a NULL pointer dereference panic in sctp_stream_free()
which was caused by an incomplete error handling in sctp_stream_init().
By not clearing stream->outcnt, it made a for() in sctp_stream_free()
think that it had elements to free when it had none, leading to the
panic.

As suggested by Xin Long, this patch also simplifies the error path by
moving it to the only if() that uses it.

See-also: https://www.spinics.net/lists/netdev/msg473756.html
See-also: https://www.spinics.net/lists/netdev/msg465024.html
Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: f952be79cebd ("sctp: introduce struct sctp_stream_out_ext")
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 76ea66be0bbe..524dfeb94c41 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -156,9 +156,9 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 	sctp_stream_outq_migrate(stream, NULL, outcnt);
 	sched->sched_all(stream);
 
-	i = sctp_stream_alloc_out(stream, outcnt, gfp);
-	if (i)
-		return i;
+	ret = sctp_stream_alloc_out(stream, outcnt, gfp);
+	if (ret)
+		goto out;
 
 	stream->outcnt = outcnt;
 	for (i = 0; i < stream->outcnt; i++)
@@ -170,19 +170,17 @@ in:
 	if (!incnt)
 		goto out;
 
-	i = sctp_stream_alloc_in(stream, incnt, gfp);
-	if (i) {
-		ret = -ENOMEM;
-		goto free;
+	ret = sctp_stream_alloc_in(stream, incnt, gfp);
+	if (ret) {
+		sched->free(stream);
+		kfree(stream->out);
+		stream->out = NULL;
+		stream->outcnt = 0;
+		goto out;
 	}
 
 	stream->incnt = incnt;
-	goto out;
 
-free:
-	sched->free(stream);
-	kfree(stream->out);
-	stream->out = NULL;
 out:
 	return ret;
 }

From f1c8d3720f2e6c8c2b209120678236debd0360e5 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Tue, 2 Jan 2018 14:05:19 -0800
Subject: [PATCH 239/305] vxlan: trivial indenting fix.

Fix indentation of reserved_flags2 field in vxlanhdr_gpe.

Fixes: e1e5314de08b ("vxlan: implement GPE")
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 13223396dc64..f96391e84a8a 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -146,7 +146,7 @@ struct vxlanhdr_gpe {
 		np_applied:1,
 		instance_applied:1,
 		version:2,
-reserved_flags2:2;
+		reserved_flags2:2;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	u8	reserved_flags2:2,
 		version:2,

From 64e711ca59ef9b7873d77ef06bc174aa01af9115 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 17 Nov 2017 15:51:47 -0800
Subject: [PATCH 240/305] i40e: Remove UDP support for big buffer

Since UDP based filters are not supported via big buffer cloud
filters, remove UDP support.  Also change a few return types to
indicate unsupported vs invalid configuration.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 321d8be80871..fffd4868defb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6038,8 +6038,8 @@ static int i40e_validate_and_set_switch_mode(struct i40e_vsi *vsi)
 	/* Set Bit 7 to be valid */
 	mode = I40E_AQ_SET_SWITCH_BIT7_VALID;
 
-	/* Set L4type to both TCP and UDP support */
-	mode |= I40E_AQ_SET_SWITCH_L4_TYPE_BOTH;
+	/* Set L4type for TCP support */
+	mode |= I40E_AQ_SET_SWITCH_L4_TYPE_TCP;
 
 	/* Set cloud filter mode */
 	mode |= I40E_AQ_SET_SWITCH_MODE_NON_TUNNEL;
@@ -6969,18 +6969,18 @@ static int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi,
 	     is_valid_ether_addr(filter->src_mac)) ||
 	    (is_multicast_ether_addr(filter->dst_mac) &&
 	     is_multicast_ether_addr(filter->src_mac)))
-		return -EINVAL;
+		return -EOPNOTSUPP;
 
-	/* Make sure port is specified, otherwise bail out, for channel
-	 * specific cloud filter needs 'L4 port' to be non-zero
+	/* Big buffer cloud filter needs 'L4 port' to be non-zero. Also, UDP
+	 * ports are not supported via big buffer now.
 	 */
-	if (!filter->dst_port)
-		return -EINVAL;
+	if (!filter->dst_port || filter->ip_proto == IPPROTO_UDP)
+		return -EOPNOTSUPP;
 
 	/* adding filter using src_port/src_ip is not supported at this stage */
 	if (filter->src_port || filter->src_ipv4 ||
 	    !ipv6_addr_any(&filter->ip.v6.src_ip6))
-		return -EINVAL;
+		return -EOPNOTSUPP;
 
 	/* copy element needed to add cloud filter from filter */
 	i40e_set_cld_element(filter, &cld_filter.element);
@@ -6991,7 +6991,7 @@ static int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi,
 	    is_multicast_ether_addr(filter->src_mac)) {
 		/* MAC + IP : unsupported mode */
 		if (filter->dst_ipv4)
-			return -EINVAL;
+			return -EOPNOTSUPP;
 
 		/* since we validated that L4 port must be valid before
 		 * we get here, start with respective "flags" value

From e90f686b4358d7d7e5dbaa48b8e78c9a4e41826e Mon Sep 17 00:00:00 2001
From: Fugang Duan <fugang.duan@nxp.com>
Date: Wed, 3 Jan 2018 10:39:29 +0800
Subject: [PATCH 241/305] net: fec: restore dev_id in the cases of probe error

The static variable dev_id is always incremented before the netdev is
registered. Its value should be restored when probe fails.

Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 8184d2fca9be..6a4fc2b35488 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3556,6 +3556,7 @@ failed_phy:
 	of_node_put(phy_node);
 failed_ioremap:
 	free_netdev(ndev);
+	dev_id--;
 
 	return ret;
 }

From 3f38c683033a9a0a2738e7067f449deefabfa3ef Mon Sep 17 00:00:00 2001
From: Fugang Duan <fugang.duan@nxp.com>
Date: Wed, 3 Jan 2018 10:39:30 +0800
Subject: [PATCH 242/305] net: fec: defer probe if regulator is not ready

Defer probe if the regulator is not ready. E.g. when the regulator is a
fixed regulator controlled by an i2c expander gpio, the i2c device may
be probed after this driver, so the driver should handle the probe
deferral error.

Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 6a4fc2b35488..19f198e22e15 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3469,6 +3469,10 @@ fec_probe(struct platform_device *pdev)
 			goto failed_regulator;
 		}
 	} else {
+		if (PTR_ERR(fep->reg_phy) == -EPROBE_DEFER) {
+			ret = -EPROBE_DEFER;
+			goto failed_regulator;
+		}
 		fep->reg_phy = NULL;
 	}
 

From 248de22e638f10bd5bfc7624a357f940f66ba137 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 8 Dec 2017 10:55:04 -0800
Subject: [PATCH 243/305] i40e/i40evf: Account for frags split over multiple
 descriptors in check linearize

The original code for __i40e_chk_linearize didn't take into account the
fact that if a fragment is 16K in size or larger it has to be split over 2
descriptors and the smaller of those 2 descriptors will be on the trailing
edge of the transmit. As a result we can get into situations where we didn't
catch requests that could result in a Tx hang.

This patch takes care of that by subtracting the length of all but the
trailing edge of the stale fragment before we test for sum. By doing this
we can guarantee that we have all cases covered, including the case of a
fragment that spans multiple descriptors. We don't need to worry about
checking the inner portions of this since 12K is the maximum aligned DMA
size and that is larger than any MSS will ever be since the MTU limit for
jumbos is something on the order of 9K.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 26 ++++++++++++++++---
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 26 ++++++++++++++++---
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 4566d66ffc7c..5bc2748ac468 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3047,10 +3047,30 @@ bool __i40e_chk_linearize(struct sk_buff *skb)
 	/* Walk through fragments adding latest fragment, testing it, and
 	 * then removing stale fragments from the sum.
 	 */
-	stale = &skb_shinfo(skb)->frags[0];
-	for (;;) {
+	for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
+		int stale_size = skb_frag_size(stale);
+
 		sum += skb_frag_size(frag++);
 
+		/* The stale fragment may present us with a smaller
+		 * descriptor than the actual fragment size. To account
+		 * for that we need to remove all the data on the front and
+		 * figure out what the remainder would be in the last
+		 * descriptor associated with the fragment.
+		 */
+		if (stale_size > I40E_MAX_DATA_PER_TXD) {
+			int align_pad = -(stale->page_offset) &
+					(I40E_MAX_READ_REQ_SIZE - 1);
+
+			sum -= align_pad;
+			stale_size -= align_pad;
+
+			do {
+				sum -= I40E_MAX_DATA_PER_TXD_ALIGNED;
+				stale_size -= I40E_MAX_DATA_PER_TXD_ALIGNED;
+			} while (stale_size > I40E_MAX_DATA_PER_TXD);
+		}
+
 		/* if sum is negative we failed to make sufficient progress */
 		if (sum < 0)
 			return true;
@@ -3058,7 +3078,7 @@ bool __i40e_chk_linearize(struct sk_buff *skb)
 		if (!nr_frags--)
 			break;
 
-		sum -= skb_frag_size(stale++);
+		sum -= stale_size;
 	}
 
 	return false;
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 50864f99446d..1ba29bb85b67 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -2012,10 +2012,30 @@ bool __i40evf_chk_linearize(struct sk_buff *skb)
 	/* Walk through fragments adding latest fragment, testing it, and
 	 * then removing stale fragments from the sum.
 	 */
-	stale = &skb_shinfo(skb)->frags[0];
-	for (;;) {
+	for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
+		int stale_size = skb_frag_size(stale);
+
 		sum += skb_frag_size(frag++);
 
+		/* The stale fragment may present us with a smaller
+		 * descriptor than the actual fragment size. To account
+		 * for that we need to remove all the data on the front and
+		 * figure out what the remainder would be in the last
+		 * descriptor associated with the fragment.
+		 */
+		if (stale_size > I40E_MAX_DATA_PER_TXD) {
+			int align_pad = -(stale->page_offset) &
+					(I40E_MAX_READ_REQ_SIZE - 1);
+
+			sum -= align_pad;
+			stale_size -= align_pad;
+
+			do {
+				sum -= I40E_MAX_DATA_PER_TXD_ALIGNED;
+				stale_size -= I40E_MAX_DATA_PER_TXD_ALIGNED;
+			} while (stale_size > I40E_MAX_DATA_PER_TXD);
+		}
+
 		/* if sum is negative we failed to make sufficient progress */
 		if (sum < 0)
 			return true;
@@ -2023,7 +2043,7 @@ bool __i40evf_chk_linearize(struct sk_buff *skb)
 		if (!nr_frags--)
 			break;
 
-		sum -= skb_frag_size(stale++);
+		sum -= stale_size;
 	}
 
 	return false;

From 458867b2ca0c987445c5d9adccd1642970e1ba07 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Wed, 20 Dec 2017 11:04:36 -0500
Subject: [PATCH 244/305] i40e: don't remove netdev->dev_addr when syncing uc
 list

In some circumstances, such as with bridging, it is possible that the
stack will add a device's own MAC address to its unicast address list.

If, later, the stack deletes this address, then the i40e driver will
receive a request to remove this address.

The driver stores its current MAC address as part of the MAC/VLAN hash
array, since it is convenient and matches exactly how the hardware
expects to be told which traffic to receive.

This causes a problem, since for most devices, the MAC address is stored
separately, and requests to delete a unicast address should not have the
ability to remove the filter for the MAC address.

Fix this by forcing a check on every address sync to ensure we do not
remove the device address.

There is a very narrow possibility of a race between .set_mac and
.set_rx_mode, if we don't change netdev->dev_addr before updating our
internal MAC list in .set_mac. This might be possible if .set_rx_mode is
going to remove MAC "XYZ" from the list, at the same time as .set_mac
changes our dev_addr to MAC "XYZ", we might possibly queue a delete,
then an add in .set_mac, then queue a delete in .set_rx_mode's
dev_uc_sync and then update netdev->dev_addr. We can avoid this by
moving the copy into dev_addr prior to the changes to the MAC filter
list.

A similar race on the other side does not cause problems, as if we're
changing our MAC from A to B, and we race with .set_rx_mode, it could
queue a delete from A, we'd update our address, and allow the delete.
This seems like a race, but in reality we're about to queue a delete of
A anyways, so it would not cause any issues.

A race in the initialization code is unlikely because the netdevice has
not yet been fully initialized and the stack should not be adding or
removing addresses yet.

Note that we don't (yet) need similar code for the VF driver because it
does not make use of __dev_uc_sync and __dev_mc_sync, but instead rolls
its own method for handling updates to the MAC/VLAN list, which already
has code to protect against removal of the hardware address.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index fffd4868defb..9e4b78e447f8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1573,11 +1573,18 @@ static int i40e_set_mac(struct net_device *netdev, void *p)
 	else
 		netdev_info(netdev, "set new mac address %pM\n", addr->sa_data);
 
+	/* Copy the address first, so that we avoid a possible race with
+	 * .set_rx_mode(). If we copy after changing the address in the filter
+	 * list, we might open ourselves to a narrow race window where
+	 * .set_rx_mode could delete our dev_addr filter and prevent traffic
+	 * from passing.
+	 */
+	ether_addr_copy(netdev->dev_addr, addr->sa_data);
+
 	spin_lock_bh(&vsi->mac_filter_hash_lock);
 	i40e_del_mac_filter(vsi, netdev->dev_addr);
 	i40e_add_mac_filter(vsi, addr->sa_data);
 	spin_unlock_bh(&vsi->mac_filter_hash_lock);
-	ether_addr_copy(netdev->dev_addr, addr->sa_data);
 	if (vsi->type == I40E_VSI_MAIN) {
 		i40e_status ret;
 
@@ -1923,6 +1930,14 @@ static int i40e_addr_unsync(struct net_device *netdev, const u8 *addr)
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
 
+	/* Under some circumstances, we might receive a request to delete
+	 * our own device address from our uc list. Because we store the
+	 * device address in the VSI's MAC/VLAN filter list, we need to ignore
+	 * such requests and not delete our device address from this list.
+	 */
+	if (ether_addr_equal(addr, netdev->dev_addr))
+		return 0;
+
 	i40e_del_mac_filter(vsi, addr);
 
 	return 0;

From bc4244c6e33f96b48c4986ce4653df4673c6a08e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 22 Dec 2017 12:45:16 +0100
Subject: [PATCH 245/305] i40e: flower: Fix return value for unsupported
 offload

When filter configuration is not supported, drivers should return
-EOPNOTSUPP so the core can react correctly.

Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 9e4b78e447f8..42dcaefc4c19 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7371,7 +7371,7 @@ static int i40e_configure_clsflower(struct i40e_vsi *vsi,
 
 	if (tc < 0) {
 		dev_err(&vsi->back->pdev->dev, "Invalid traffic class\n");
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 
 	if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state) ||

From 15962a18284552b5ec58982ff60a5e92e0c5c92b Mon Sep 17 00:00:00 2001
From: Arjun Vynipadath <arjun@chelsio.com>
Date: Wed, 3 Jan 2018 11:44:07 +0530
Subject: [PATCH 246/305] cxgb4: Fix FW flash errors

commit 96ac18f14a5a ("cxgb4: Add support for new flash parts")
removed initialization of adapter->params.sf_fw_start causing issues
while flashing firmware to card. We no longer need sf_fw_start
in adapter->params as we already have macros defined for FW flash
addresses.

Fixes: 96ac18f14a5a ("cxgb4: Add support for new flash parts")
Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |  1 -
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 17 ++++++++---------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 6f9fa6e3c42a..d8424ed16c33 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -344,7 +344,6 @@ struct adapter_params {
 
 	unsigned int sf_size;             /* serial flash size in bytes */
 	unsigned int sf_nsec;             /* # of flash sectors */
-	unsigned int sf_fw_start;         /* start of FW image in flash */
 
 	unsigned int fw_vers;		  /* firmware version */
 	unsigned int bs_vers;		  /* bootstrap version */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index f63210f15579..375ef86a84da 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -2844,8 +2844,6 @@ enum {
 	SF_RD_DATA_FAST = 0xb,        /* read flash */
 	SF_RD_ID        = 0x9f,       /* read ID */
 	SF_ERASE_SECTOR = 0xd8,       /* erase sector */
-
-	FW_MAX_SIZE = 16 * SF_SEC_SIZE,
 };
 
 /**
@@ -3558,8 +3556,9 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size)
 	const __be32 *p = (const __be32 *)fw_data;
 	const struct fw_hdr *hdr = (const struct fw_hdr *)fw_data;
 	unsigned int sf_sec_size = adap->params.sf_size / adap->params.sf_nsec;
-	unsigned int fw_img_start = adap->params.sf_fw_start;
-	unsigned int fw_start_sec = fw_img_start / sf_sec_size;
+	unsigned int fw_start_sec = FLASH_FW_START_SEC;
+	unsigned int fw_size = FLASH_FW_MAX_SIZE;
+	unsigned int fw_start = FLASH_FW_START;
 
 	if (!size) {
 		dev_err(adap->pdev_dev, "FW image has no data\n");
@@ -3575,9 +3574,9 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size)
 			"FW image size differs from size in FW header\n");
 		return -EINVAL;
 	}
-	if (size > FW_MAX_SIZE) {
+	if (size > fw_size) {
 		dev_err(adap->pdev_dev, "FW image too large, max is %u bytes\n",
-			FW_MAX_SIZE);
+			fw_size);
 		return -EFBIG;
 	}
 	if (!t4_fw_matches_chip(adap, hdr))
@@ -3604,11 +3603,11 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size)
 	 */
 	memcpy(first_page, fw_data, SF_PAGE_SIZE);
 	((struct fw_hdr *)first_page)->fw_ver = cpu_to_be32(0xffffffff);
-	ret = t4_write_flash(adap, fw_img_start, SF_PAGE_SIZE, first_page);
+	ret = t4_write_flash(adap, fw_start, SF_PAGE_SIZE, first_page);
 	if (ret)
 		goto out;
 
-	addr = fw_img_start;
+	addr = fw_start;
 	for (size -= SF_PAGE_SIZE; size; size -= SF_PAGE_SIZE) {
 		addr += SF_PAGE_SIZE;
 		fw_data += SF_PAGE_SIZE;
@@ -3618,7 +3617,7 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size)
 	}
 
 	ret = t4_write_flash(adap,
-			     fw_img_start + offsetof(struct fw_hdr, fw_ver),
+			     fw_start + offsetof(struct fw_hdr, fw_ver),
 			     sizeof(hdr->fw_ver), (const u8 *)&hdr->fw_ver);
 out:
 	if (ret)

From 7853b49ce8e0ef6364d24512b287463841d71bd3 Mon Sep 17 00:00:00 2001
From: Netanel Belgazal <netanel@amazon.com>
Date: Wed, 3 Jan 2018 06:17:29 +0000
Subject: [PATCH 247/305] net: ena: unmask MSI-X only after device
 initialization is completed

Under certain conditions MSI-X interrupt might arrive right after it
was unmasked in ena_up(). There is a chance it would be processed by
the driver before device ENA_FLAG_DEV_UP flag is set. In such a case
the interrupt is ignored.
The ENA device operates in auto-masked mode, therefore ignoring the
interrupt leaves it masked for good.
Move the unmasking of interrupts to be the last step in ena_up().

Signed-off-by: Netanel Belgazal <netanel@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 26 ++++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 97c5a89a9cf7..6fb28fd43eb3 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -1565,7 +1565,7 @@ static int ena_rss_configure(struct ena_adapter *adapter)
 
 static int ena_up_complete(struct ena_adapter *adapter)
 {
-	int rc, i;
+	int rc;
 
 	rc = ena_rss_configure(adapter);
 	if (rc)
@@ -1584,17 +1584,6 @@ static int ena_up_complete(struct ena_adapter *adapter)
 
 	ena_napi_enable_all(adapter);
 
-	/* Enable completion queues interrupt */
-	for (i = 0; i < adapter->num_queues; i++)
-		ena_unmask_interrupt(&adapter->tx_ring[i],
-				     &adapter->rx_ring[i]);
-
-	/* schedule napi in case we had pending packets
-	 * from the last time we disable napi
-	 */
-	for (i = 0; i < adapter->num_queues; i++)
-		napi_schedule(&adapter->ena_napi[i].napi);
-
 	return 0;
 }
 
@@ -1731,7 +1720,7 @@ create_err:
 
 static int ena_up(struct ena_adapter *adapter)
 {
-	int rc;
+	int rc, i;
 
 	netdev_dbg(adapter->netdev, "%s\n", __func__);
 
@@ -1774,6 +1763,17 @@ static int ena_up(struct ena_adapter *adapter)
 
 	set_bit(ENA_FLAG_DEV_UP, &adapter->flags);
 
+	/* Enable completion queues interrupt */
+	for (i = 0; i < adapter->num_queues; i++)
+		ena_unmask_interrupt(&adapter->tx_ring[i],
+				     &adapter->rx_ring[i]);
+
+	/* schedule napi in case we had pending packets
+	 * from the last time we disable napi
+	 */
+	for (i = 0; i < adapter->num_queues; i++)
+		napi_schedule(&adapter->ena_napi[i].napi);
+
 	return rc;
 
 err_up:

From ee4552aaf3fef5345199b8a82e40be7245b289fb Mon Sep 17 00:00:00 2001
From: Netanel Belgazal <netanel@amazon.com>
Date: Wed, 3 Jan 2018 06:17:30 +0000
Subject: [PATCH 248/305] net: ena: fix error handling in ena_down() sequence

ENA admin command queue errors are not handled as part of ena_down().
As a result, in case of an error the admin queue transitions to a
non-running state and aborts all subsequent commands, including those
coming from ena_up(). A reset scheduled by the driver from the timer
service context would not proceed due to sharing rtnl with
ena_up()/ena_down().

Signed-off-by: Netanel Belgazal <netanel@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 6fb28fd43eb3..fbe21a817bd8 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -75,6 +75,9 @@ static struct workqueue_struct *ena_wq;
 MODULE_DEVICE_TABLE(pci, ena_pci_tbl);
 
 static int ena_rss_init_default(struct ena_adapter *adapter);
+static void check_for_admin_com_state(struct ena_adapter *adapter);
+static void ena_destroy_device(struct ena_adapter *adapter);
+static int ena_restore_device(struct ena_adapter *adapter);
 
 static void ena_tx_timeout(struct net_device *dev)
 {
@@ -1884,6 +1887,17 @@ static int ena_close(struct net_device *netdev)
 	if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
 		ena_down(adapter);
 
+	/* Check for device status and issue reset if needed*/
+	check_for_admin_com_state(adapter);
+	if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) {
+		netif_err(adapter, ifdown, adapter->netdev,
+			  "Destroy failure, restarting device\n");
+		ena_dump_stats_to_dmesg(adapter);
+		/* rtnl lock already obtained in dev_ioctl() layer */
+		ena_destroy_device(adapter);
+		ena_restore_device(adapter);
+	}
+
 	return 0;
 }
 
@@ -2544,11 +2558,12 @@ static void ena_destroy_device(struct ena_adapter *adapter)
 
 	ena_com_set_admin_running_state(ena_dev, false);
 
-	ena_close(netdev);
+	if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
+		ena_down(adapter);
 
 	/* Before releasing the ENA resources, a device reset is required.
 	 * (to prevent the device from accessing them).
-	 * In case the reset flag is set and the device is up, ena_close
+	 * In case the reset flag is set and the device is up, ena_down()
 	 * already perform the reset, so it can be skipped.
 	 */
 	if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up))

From e816c201aed5232171f8eb80b5d46ae6516683b9 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 2 Jan 2018 15:21:33 -0800
Subject: [PATCH 249/305] exec: Weaken dumpability for secureexec

This is a logical revert of commit e37fdb785a5f ("exec: Use secureexec
for setting dumpability")

This weakens dumpability back to checking only for uid/gid changes in
current (which is useless), but userspace depends on dumpability not
being tied to secureexec.

  https://bugzilla.redhat.com/show_bug.cgi?id=1528633

Reported-by: Tom Horsley <horsley1953@gmail.com>
Fixes: e37fdb785a5f ("exec: Use secureexec for setting dumpability")
Cc: stable@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 5688b5e1b937..7eb8d21bcab9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1349,9 +1349,14 @@ void setup_new_exec(struct linux_binprm * bprm)
 
 	current->sas_ss_sp = current->sas_ss_size = 0;
 
-	/* Figure out dumpability. */
+	/*
+	 * Figure out dumpability. Note that this checking only of current
+	 * is wrong, but userspace depends on it. This should be testing
+	 * bprm->secureexec instead.
+	 */
 	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
-	    bprm->secureexec)
+	    !(uid_eq(current_euid(), current_uid()) &&
+	      gid_eq(current_egid(), current_gid())))
 		set_dumpable(current->mm, suid_dumpable);
 	else
 		set_dumpable(current->mm, SUID_DUMP_USER);

From ee4aa8df70fa6d76bd776c025dc0d8d746c18317 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Wed, 3 Jan 2018 13:09:23 -0500
Subject: [PATCH 250/305] 3c59x: fix missing dma_mapping_error check and bad
 ring refill logic

A few spots in 3c59x missed calls to dma_mapping_error checks, causing
WARN_ONs to trigger.  Clean those up.  While we're at it, refactor the
refill code a bit so that if skb allocation or dma mapping fails, we
recycle the existing buffer.  This prevents holes in the rx ring, and
makes for much simpler logic.

Note: This is compile only tested.  Ted, if you could run this and
confirm that it continues to work properly, I would appreciate it, as I
currently don't have access to this hardware.

Signed-off-by: Neil Horman <nhorman@redhat.com>
CC: Steffen Klassert <klassert@mathematik.tu-chemnitz.de>
CC: "David S. Miller" <davem@davemloft.net>
Reported-by: tedheadster@gmail.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/3com/3c59x.c | 90 +++++++++++++------------------
 1 file changed, 38 insertions(+), 52 deletions(-)

diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index f4e13a7014bd..36c8950dbd2d 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -602,7 +602,7 @@ struct vortex_private {
 	struct sk_buff* rx_skbuff[RX_RING_SIZE];
 	struct sk_buff* tx_skbuff[TX_RING_SIZE];
 	unsigned int cur_rx, cur_tx;		/* The next free ring entry */
-	unsigned int dirty_rx, dirty_tx;	/* The ring entries to be free()ed. */
+	unsigned int dirty_tx;	/* The ring entries to be free()ed. */
 	struct vortex_extra_stats xstats;	/* NIC-specific extra stats */
 	struct sk_buff *tx_skb;				/* Packet being eaten by bus master ctrl.  */
 	dma_addr_t tx_skb_dma;				/* Allocated DMA address for bus master ctrl DMA.   */
@@ -618,7 +618,6 @@ struct vortex_private {
 
 	/* The remainder are related to chip state, mostly media selection. */
 	struct timer_list timer;			/* Media selection timer. */
-	struct timer_list rx_oom_timer;		/* Rx skb allocation retry timer */
 	int options;						/* User-settable misc. driver options. */
 	unsigned int media_override:4, 		/* Passed-in media type. */
 		default_media:4,				/* Read from the EEPROM/Wn3_Config. */
@@ -760,7 +759,6 @@ static void mdio_sync(struct vortex_private *vp, int bits);
 static int mdio_read(struct net_device *dev, int phy_id, int location);
 static void mdio_write(struct net_device *vp, int phy_id, int location, int value);
 static void vortex_timer(struct timer_list *t);
-static void rx_oom_timer(struct timer_list *t);
 static netdev_tx_t vortex_start_xmit(struct sk_buff *skb,
 				     struct net_device *dev);
 static netdev_tx_t boomerang_start_xmit(struct sk_buff *skb,
@@ -1601,7 +1599,6 @@ vortex_up(struct net_device *dev)
 
 	timer_setup(&vp->timer, vortex_timer, 0);
 	mod_timer(&vp->timer, RUN_AT(media_tbl[dev->if_port].wait));
-	timer_setup(&vp->rx_oom_timer, rx_oom_timer, 0);
 
 	if (vortex_debug > 1)
 		pr_debug("%s: Initial media type %s.\n",
@@ -1676,7 +1673,7 @@ vortex_up(struct net_device *dev)
 	window_write16(vp, 0x0040, 4, Wn4_NetDiag);
 
 	if (vp->full_bus_master_rx) { /* Boomerang bus master. */
-		vp->cur_rx = vp->dirty_rx = 0;
+		vp->cur_rx = 0;
 		/* Initialize the RxEarly register as recommended. */
 		iowrite16(SetRxThreshold + (1536>>2), ioaddr + EL3_CMD);
 		iowrite32(0x0020, ioaddr + PktStatus);
@@ -1729,6 +1726,7 @@ vortex_open(struct net_device *dev)
 	struct vortex_private *vp = netdev_priv(dev);
 	int i;
 	int retval;
+	dma_addr_t dma;
 
 	/* Use the now-standard shared IRQ implementation. */
 	if ((retval = request_irq(dev->irq, vp->full_bus_master_rx ?
@@ -1753,7 +1751,11 @@ vortex_open(struct net_device *dev)
 				break;			/* Bad news!  */
 
 			skb_reserve(skb, NET_IP_ALIGN);	/* Align IP on 16 byte boundaries */
-			vp->rx_ring[i].addr = cpu_to_le32(pci_map_single(VORTEX_PCI(vp), skb->data, PKT_BUF_SZ, PCI_DMA_FROMDEVICE));
+			dma = pci_map_single(VORTEX_PCI(vp), skb->data,
+					     PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+			if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma))
+				break;
+			vp->rx_ring[i].addr = cpu_to_le32(dma);
 		}
 		if (i != RX_RING_SIZE) {
 			pr_emerg("%s: no memory for rx ring\n", dev->name);
@@ -2067,6 +2069,12 @@ vortex_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		int len = (skb->len + 3) & ~3;
 		vp->tx_skb_dma = pci_map_single(VORTEX_PCI(vp), skb->data, len,
 						PCI_DMA_TODEVICE);
+		if (dma_mapping_error(&VORTEX_PCI(vp)->dev, vp->tx_skb_dma)) {
+			dev_kfree_skb_any(skb);
+			dev->stats.tx_dropped++;
+			return NETDEV_TX_OK;
+		}
+
 		spin_lock_irq(&vp->window_lock);
 		window_set(vp, 7);
 		iowrite32(vp->tx_skb_dma, ioaddr + Wn7_MasterAddr);
@@ -2593,7 +2601,7 @@ boomerang_rx(struct net_device *dev)
 	int entry = vp->cur_rx % RX_RING_SIZE;
 	void __iomem *ioaddr = vp->ioaddr;
 	int rx_status;
-	int rx_work_limit = vp->dirty_rx + RX_RING_SIZE - vp->cur_rx;
+	int rx_work_limit = RX_RING_SIZE;
 
 	if (vortex_debug > 5)
 		pr_debug("boomerang_rx(): status %4.4x\n", ioread16(ioaddr+EL3_STATUS));
@@ -2614,7 +2622,8 @@ boomerang_rx(struct net_device *dev)
 		} else {
 			/* The packet length: up to 4.5K!. */
 			int pkt_len = rx_status & 0x1fff;
-			struct sk_buff *skb;
+			struct sk_buff *skb, *newskb;
+			dma_addr_t newdma;
 			dma_addr_t dma = le32_to_cpu(vp->rx_ring[entry].addr);
 
 			if (vortex_debug > 4)
@@ -2633,9 +2642,27 @@ boomerang_rx(struct net_device *dev)
 				pci_dma_sync_single_for_device(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
 				vp->rx_copy++;
 			} else {
+				/* Pre-allocate the replacement skb.  If it or its
+				 * mapping fails then recycle the buffer thats already
+				 * in place
+				 */
+				newskb = netdev_alloc_skb_ip_align(dev, PKT_BUF_SZ);
+				if (!newskb) {
+					dev->stats.rx_dropped++;
+					goto clear_complete;
+				}
+				newdma = pci_map_single(VORTEX_PCI(vp), newskb->data,
+							PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+				if (dma_mapping_error(&VORTEX_PCI(vp)->dev, newdma)) {
+					dev->stats.rx_dropped++;
+					consume_skb(newskb);
+					goto clear_complete;
+				}
+
 				/* Pass up the skbuff already on the Rx ring. */
 				skb = vp->rx_skbuff[entry];
-				vp->rx_skbuff[entry] = NULL;
+				vp->rx_skbuff[entry] = newskb;
+				vp->rx_ring[entry].addr = cpu_to_le32(newdma);
 				skb_put(skb, pkt_len);
 				pci_unmap_single(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
 				vp->rx_nocopy++;
@@ -2653,55 +2680,15 @@ boomerang_rx(struct net_device *dev)
 			netif_rx(skb);
 			dev->stats.rx_packets++;
 		}
-		entry = (++vp->cur_rx) % RX_RING_SIZE;
-	}
-	/* Refill the Rx ring buffers. */
-	for (; vp->cur_rx - vp->dirty_rx > 0; vp->dirty_rx++) {
-		struct sk_buff *skb;
-		entry = vp->dirty_rx % RX_RING_SIZE;
-		if (vp->rx_skbuff[entry] == NULL) {
-			skb = netdev_alloc_skb_ip_align(dev, PKT_BUF_SZ);
-			if (skb == NULL) {
-				static unsigned long last_jif;
-				if (time_after(jiffies, last_jif + 10 * HZ)) {
-					pr_warn("%s: memory shortage\n",
-						dev->name);
-					last_jif = jiffies;
-				}
-				if ((vp->cur_rx - vp->dirty_rx) == RX_RING_SIZE)
-					mod_timer(&vp->rx_oom_timer, RUN_AT(HZ * 1));
-				break;			/* Bad news!  */
-			}
 
-			vp->rx_ring[entry].addr = cpu_to_le32(pci_map_single(VORTEX_PCI(vp), skb->data, PKT_BUF_SZ, PCI_DMA_FROMDEVICE));
-			vp->rx_skbuff[entry] = skb;
-		}
+clear_complete:
 		vp->rx_ring[entry].status = 0;	/* Clear complete bit. */
 		iowrite16(UpUnstall, ioaddr + EL3_CMD);
+		entry = (++vp->cur_rx) % RX_RING_SIZE;
 	}
 	return 0;
 }
 
-/*
- * If we've hit a total OOM refilling the Rx ring we poll once a second
- * for some memory.  Otherwise there is no way to restart the rx process.
- */
-static void
-rx_oom_timer(struct timer_list *t)
-{
-	struct vortex_private *vp = from_timer(vp, t, rx_oom_timer);
-	struct net_device *dev = vp->mii.dev;
-
-	spin_lock_irq(&vp->lock);
-	if ((vp->cur_rx - vp->dirty_rx) == RX_RING_SIZE)	/* This test is redundant, but makes me feel good */
-		boomerang_rx(dev);
-	if (vortex_debug > 1) {
-		pr_debug("%s: rx_oom_timer %s\n", dev->name,
-			((vp->cur_rx - vp->dirty_rx) != RX_RING_SIZE) ? "succeeded" : "retrying");
-	}
-	spin_unlock_irq(&vp->lock);
-}
-
 static void
 vortex_down(struct net_device *dev, int final_down)
 {
@@ -2711,7 +2698,6 @@ vortex_down(struct net_device *dev, int final_down)
 	netdev_reset_queue(dev);
 	netif_stop_queue(dev);
 
-	del_timer_sync(&vp->rx_oom_timer);
 	del_timer_sync(&vp->timer);
 
 	/* Turn off statistics ASAP.  We update dev->stats below. */

From d7732ba55c4b6a2da339bb12589c515830cfac2c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Jan 2018 19:52:04 +0100
Subject: [PATCH 251/305] x86/pti: Switch to kernel CR3 at early in
 entry_SYSCALL_compat()

The preparation for PTI which added CR3 switching to the entry code
misplaced the CR3 switch in entry_SYSCALL_compat().

With PTI enabled the entry code tries to access a per cpu variable after
switching to kernel GS. This fails because that variable is not mapped to
user space. This results in a double fault and in the worst case a kernel
crash.

Move the switch ahead of the access and clobber RSP which has been saved
already.

Fixes: 8a09317b895f ("x86/mm/pti: Prepare the x86/entry assembly code for entry/exit CR3 switching")
Reported-by: Lars Wendler <wendler.lars@web.de>
Reported-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031949200.1957@nanos
---
 arch/x86/entry/entry_64_compat.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 40f17009ec20..98d5358e4041 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -190,8 +190,13 @@ ENTRY(entry_SYSCALL_compat)
 	/* Interrupts are off on entry. */
 	swapgs
 
-	/* Stash user ESP and switch to the kernel stack. */
+	/* Stash user ESP */
 	movl	%esp, %r8d
+
+	/* Use %rsp as scratch reg. User ESP is stashed in r8 */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+	/* Switch to the kernel stack */
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/* Construct struct pt_regs on stack */
@@ -219,12 +224,6 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 
-	/*
-	 * We just saved %rdi so it is safe to clobber.  It is not
-	 * preserved during the C calls inside TRACE_IRQS_OFF anyway.
-	 */
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
-
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * turned them off.

From 2fd9c41aea47f4ad071accf94b94f94f2c4d31eb Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Wed, 3 Jan 2018 12:39:52 -0800
Subject: [PATCH 252/305] x86/process: Define cpu_tss_rw in same section as
 declaration

cpu_tss_rw is declared with DECLARE_PER_CPU_PAGE_ALIGNED
but then defined with DEFINE_PER_CPU_SHARED_ALIGNED
leading to section mismatch warnings.

Use DEFINE_PER_CPU_PAGE_ALIGNED consistently. This is necessary because
it's mapped to the cpu entry area and must be page aligned.

[ tglx: Massaged changelog a bit ]

Fixes: 1a935bc3d4ea ("x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct")
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: thomas.lendacky@amd.com
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: tklauser@distanz.ch
Cc: minipli@googlemail.com
Cc: me@kylehuey.com
Cc: namit@vmware.com
Cc: luto@kernel.org
Cc: jpoimboe@redhat.com
Cc: tj@kernel.org
Cc: cl@linux.com
Cc: bp@suse.de
Cc: thgarnie@google.com
Cc: kirill.shutemov@linux.intel.com
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180103203954.183360-1-ndesaulniers@google.com
---
 arch/x86/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 517415978409..3cb2486c47e4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,7 +47,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
+__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 	.x86_tss = {
 		/*
 		 * .sp0 is only used when entering ring 0 from a lower

From bec40c26041de61162f7be9d2ce548c756ce0f65 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Wed, 3 Jan 2018 13:39:15 -0800
Subject: [PATCH 253/305] IB/srpt: Disable RDMA access by the initiator

With the SRP protocol all RDMA operations are initiated by the target.
Since no RDMA operations are initiated by the initiator, do not grant
the initiator permission to submit RDMA reads or writes to the target.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/srpt/ib_srpt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 8a1bd354b1cc..7c4249038004 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -1013,8 +1013,7 @@ static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp)
 		return -ENOMEM;
 
 	attr->qp_state = IB_QPS_INIT;
-	attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
-	    IB_ACCESS_REMOTE_WRITE;
+	attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE;
 	attr->port_num = ch->sport->port;
 	attr->pkey_index = 0;
 

From a1ffa4670cb97ae3a4b3e8535d88be5f643f7c3b Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Wed, 3 Jan 2018 13:39:16 -0800
Subject: [PATCH 254/305] IB/srpt: Fix ACL lookup during login

Make sure that the initiator port GUID is stored in ch->ini_guid.
Note: when initiating a connection sgid and dgid members in struct
sa_path_rec represent the source and destination GIDs. When accepting
a connection however sgid represents the destination GID and dgid the
source GID.

Fixes: commit 2bce1a6d2209 ("IB/srpt: Accept GUIDs as port names")
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/srpt/ib_srpt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 7c4249038004..bfa576aa9f03 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -2077,7 +2077,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id,
 		goto destroy_ib;
 	}
 
-	guid = (__be16 *)&param->primary_path->sgid.global.interface_id;
+	guid = (__be16 *)&param->primary_path->dgid.global.interface_id;
 	snprintf(ch->ini_guid, sizeof(ch->ini_guid), "%04x:%04x:%04x:%04x",
 		 be16_to_cpu(guid[0]), be16_to_cpu(guid[1]),
 		 be16_to_cpu(guid[2]), be16_to_cpu(guid[3]));

From 30414f3010aff95ffdb6bed7b9dce62cde94fdc7 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Tue, 2 Jan 2018 12:18:37 -0800
Subject: [PATCH 255/305] drm/i915: Apply Display WA #1183 on skl, kbl, and cfl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Display WA #1183 was recently added to workaround
"Failures when enabling DPLL0 with eDP link rate 2.16
or 4.32 GHz and CD clock frequency 308.57 or 617.14 MHz
(CDCLK_CTL CD Frequency Select 10b or 11b) used in this
 enabling or in previous enabling."

This workaround was designed to minimize the impact by covering
only the bad case with those link rates. But HW engineers
indicated that it should be safe to apply broadly, although
they were expecting the DPLL0 link rate to be unchanged at
runtime.

We need to cover 2 cases: when we are in fact enabling DPLL0
and when we are just changing the frequency with small
differences.

This is based on previous patch by Rodrigo Vivi with suggestions
from Ville Syrjälä.

Cc: Arthur J Runyan <arthur.j.runyan@intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171204232210.4958-1-lucas.demarchi@intel.com
(cherry picked from commit 53421c2fe99ce16838639ad89d772d914a119a49)
[ Lucas: Backport to 4.15 adding back variable that has been removed on
  commits not meant to be backported ]
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180102201837.6812-1-lucas.demarchi@intel.com
---
 drivers/gpu/drm/i915/i915_reg.h         |  2 ++
 drivers/gpu/drm/i915/intel_cdclk.c      | 35 ++++++++++++++++++-------
 drivers/gpu/drm/i915/intel_runtime_pm.c | 10 +++++++
 3 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 3866c49bc390..333f40bc03bb 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -6977,6 +6977,7 @@ enum {
 #define  RESET_PCH_HANDSHAKE_ENABLE	(1<<4)
 
 #define GEN8_CHICKEN_DCPR_1		_MMIO(0x46430)
+#define   SKL_SELECT_ALTERNATE_DC_EXIT	(1<<30)
 #define   MASK_WAKEMEM			(1<<13)
 
 #define SKL_DFSM			_MMIO(0x51000)
@@ -8522,6 +8523,7 @@ enum skl_power_gate {
 #define  BXT_CDCLK_CD2X_DIV_SEL_2	(2<<22)
 #define  BXT_CDCLK_CD2X_DIV_SEL_4	(3<<22)
 #define  BXT_CDCLK_CD2X_PIPE(pipe)	((pipe)<<20)
+#define  CDCLK_DIVMUX_CD_OVERRIDE	(1<<19)
 #define  BXT_CDCLK_CD2X_PIPE_NONE	BXT_CDCLK_CD2X_PIPE(3)
 #define  BXT_CDCLK_SSA_PRECHARGE_ENABLE	(1<<16)
 #define  CDCLK_FREQ_DECIMAL_MASK	(0x7ff)
diff --git a/drivers/gpu/drm/i915/intel_cdclk.c b/drivers/gpu/drm/i915/intel_cdclk.c
index b2a6d62b71c0..60cf4e58389a 100644
--- a/drivers/gpu/drm/i915/intel_cdclk.c
+++ b/drivers/gpu/drm/i915/intel_cdclk.c
@@ -860,16 +860,10 @@ static void skl_set_preferred_cdclk_vco(struct drm_i915_private *dev_priv,
 
 static void skl_dpll0_enable(struct drm_i915_private *dev_priv, int vco)
 {
-	int min_cdclk = skl_calc_cdclk(0, vco);
 	u32 val;
 
 	WARN_ON(vco != 8100000 && vco != 8640000);
 
-	/* select the minimum CDCLK before enabling DPLL 0 */
-	val = CDCLK_FREQ_337_308 | skl_cdclk_decimal(min_cdclk);
-	I915_WRITE(CDCLK_CTL, val);
-	POSTING_READ(CDCLK_CTL);
-
 	/*
 	 * We always enable DPLL0 with the lowest link rate possible, but still
 	 * taking into account the VCO required to operate the eDP panel at the
@@ -923,7 +917,7 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
 {
 	int cdclk = cdclk_state->cdclk;
 	int vco = cdclk_state->vco;
-	u32 freq_select, pcu_ack;
+	u32 freq_select, pcu_ack, cdclk_ctl;
 	int ret;
 
 	WARN_ON((cdclk == 24000) != (vco == 0));
@@ -940,7 +934,7 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
 		return;
 	}
 
-	/* set CDCLK_CTL */
+	/* Choose frequency for this cdclk */
 	switch (cdclk) {
 	case 450000:
 	case 432000:
@@ -968,10 +962,33 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
 	    dev_priv->cdclk.hw.vco != vco)
 		skl_dpll0_disable(dev_priv);
 
+	cdclk_ctl = I915_READ(CDCLK_CTL);
+
+	if (dev_priv->cdclk.hw.vco != vco) {
+		/* Wa Display #1183: skl,kbl,cfl */
+		cdclk_ctl &= ~(CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK);
+		cdclk_ctl |= freq_select | skl_cdclk_decimal(cdclk);
+		I915_WRITE(CDCLK_CTL, cdclk_ctl);
+	}
+
+	/* Wa Display #1183: skl,kbl,cfl */
+	cdclk_ctl |= CDCLK_DIVMUX_CD_OVERRIDE;
+	I915_WRITE(CDCLK_CTL, cdclk_ctl);
+	POSTING_READ(CDCLK_CTL);
+
 	if (dev_priv->cdclk.hw.vco != vco)
 		skl_dpll0_enable(dev_priv, vco);
 
-	I915_WRITE(CDCLK_CTL, freq_select | skl_cdclk_decimal(cdclk));
+	/* Wa Display #1183: skl,kbl,cfl */
+	cdclk_ctl &= ~(CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK);
+	I915_WRITE(CDCLK_CTL, cdclk_ctl);
+
+	cdclk_ctl |= freq_select | skl_cdclk_decimal(cdclk);
+	I915_WRITE(CDCLK_CTL, cdclk_ctl);
+
+	/* Wa Display #1183: skl,kbl,cfl */
+	cdclk_ctl &= ~CDCLK_DIVMUX_CD_OVERRIDE;
+	I915_WRITE(CDCLK_CTL, cdclk_ctl);
 	POSTING_READ(CDCLK_CTL);
 
 	/* inform PCU of the change */
diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c
index 9bf46ab211cb..7e115f3927f6 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -598,6 +598,11 @@ void gen9_enable_dc5(struct drm_i915_private *dev_priv)
 
 	DRM_DEBUG_KMS("Enabling DC5\n");
 
+	/* Wa Display #1183: skl,kbl,cfl */
+	if (IS_GEN9_BC(dev_priv))
+		I915_WRITE(GEN8_CHICKEN_DCPR_1, I915_READ(GEN8_CHICKEN_DCPR_1) |
+			   SKL_SELECT_ALTERNATE_DC_EXIT);
+
 	gen9_set_dc_state(dev_priv, DC_STATE_EN_UPTO_DC5);
 }
 
@@ -625,6 +630,11 @@ void skl_disable_dc6(struct drm_i915_private *dev_priv)
 {
 	DRM_DEBUG_KMS("Disabling DC6\n");
 
+	/* Wa Display #1183: skl,kbl,cfl */
+	if (IS_GEN9_BC(dev_priv))
+		I915_WRITE(GEN8_CHICKEN_DCPR_1, I915_READ(GEN8_CHICKEN_DCPR_1) |
+			   SKL_SELECT_ALTERNATE_DC_EXIT);
+
 	gen9_set_dc_state(dev_priv, DC_STATE_DISABLE);
 }
 

From 3ea15452ee85754f70f3b9fa1f23165ef2e77ba7 Mon Sep 17 00:00:00 2001
From: Hao Chen <flank3rsky@gmail.com>
Date: Wed, 3 Jan 2018 11:00:31 +0800
Subject: [PATCH 256/305] nl80211: Check for the required netlink attribute
 presence

nl80211_nan_add_func() does not check if the required attribute
NL80211_NAN_FUNC_FOLLOW_UP_DEST is present when processing
NL80211_CMD_ADD_NAN_FUNCTION request. This request can be issued
by users with CAP_NET_ADMIN privilege and may result in NULL dereference
and a system crash. Add a check for the required attribute presence.

Signed-off-by: Hao Chen <flank3rsky@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 213d0c498c97..2b3dbcd40e46 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -11361,7 +11361,8 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
 		break;
 	case NL80211_NAN_FUNC_FOLLOW_UP:
 		if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] ||
-		    !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]) {
+		    !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] ||
+		    !tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]) {
 			err = -EINVAL;
 			goto out;
 		}

From 736a80bbfda709fb3631f5f62056f250a38e5804 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Jan 2018 15:51:53 +0100
Subject: [PATCH 257/305] mac80211: mesh: drop frames appearing to be from us

If there are multiple mesh stations with the same MAC address,
they will both get confused and start throwing warnings.

Obviously in this case nothing can actually work anyway, so just
drop frames that look like they're from ourselves early on.

Reported-by: Gui Iribarren <gui@altermundi.net>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 70e9d2ca8bbe..4daafb07602f 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3632,6 +3632,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
 		}
 		return true;
 	case NL80211_IFTYPE_MESH_POINT:
+		if (ether_addr_equal(sdata->vif.addr, hdr->addr2))
+			return false;
 		if (multicast)
 			return true;
 		return ether_addr_equal(sdata->vif.addr, hdr->addr1);

From 54e98b5d663fcd8e3279c2391537b1a1f7bfe344 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 3 Jan 2018 22:02:29 -0800
Subject: [PATCH 258/305] net: dsa: b53: Turn off Broadcom tags for more
 switches

Models such as BCM5395/97/98 and BCM53125/24/53115 and compatible require that
we turn on managed mode to actually act on Broadcom tags, otherwise they just
pass them through on ingress (host -> switch) and don't insert them in egress
(switch -> host). Turning on managed mode is simple, but requires us to
properly support ARL misses on multicast addresses which is a much more
involved set of changes not suitable for a bug fix for this release.

Reported-by: Jochen Friedrich <jochen@scram.de>
Fixes: 7edc58d614d4 ("net: dsa: b53: Turn on Broadcom tags")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index f5a8dd96fd75..4498ab897d94 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1500,10 +1500,13 @@ static enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds,
 {
 	struct b53_device *dev = ds->priv;
 
-	/* Older models support a different tag format that we do not
-	 * support in net/dsa/tag_brcm.c yet.
+	/* Older models (5325, 5365) support a different tag format that we do
+	 * not support in net/dsa/tag_brcm.c yet. 539x and 531x5 require managed
+	 * mode to be turned on which means we need to specifically manage ARL
+	 * misses on multicast addresses (TBD).
 	 */
-	if (is5325(dev) || is5365(dev) || !b53_can_enable_brcm_tags(ds, port))
+	if (is5325(dev) || is5365(dev) || is539x(dev) || is531x5(dev) ||
+	    !b53_can_enable_brcm_tags(ds, port))
 		return DSA_TAG_PROTO_NONE;
 
 	/* Broadcom BCM58xx chips have a flow accelerator on Port 8

From b4c2951a4833e66f1bbfe65ddcd4fdcdfafe5e8f Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Sat, 2 Dec 2017 18:48:52 +0100
Subject: [PATCH 259/305] can: vxcan: improve handling of missing peer name
 attribute

Picking up the patch from Serhey Popovych (commit 191cdb3822e5df6b3c8,
"veth: Be more robust on network device creation when no attributes").

When the peer name attribute is not provided, the former implementation tries
to register the given device name twice, which leads to -EEXIST.
If only one device name is given, apply an automatically generated and valid
name for the peer.

Cc: Serhey Popovych <serhe.popovych@gmail.com>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Cc: linux-stable <stable@vger.kernel.org>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/vxcan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c
index 8404e8852a0f..b4c4a2c76437 100644
--- a/drivers/net/can/vxcan.c
+++ b/drivers/net/can/vxcan.c
@@ -194,7 +194,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev,
 		tbp = peer_tb;
 	}
 
-	if (tbp[IFLA_IFNAME]) {
+	if (ifmp && tbp[IFLA_IFNAME]) {
 		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
 		name_assign_type = NET_NAME_USER;
 	} else {

From d5b42e6607661b198d8b26a0c30969605b1bf5c7 Mon Sep 17 00:00:00 2001
From: Wolfgang Grandegger <wg@grandegger.com>
Date: Wed, 13 Dec 2017 19:52:23 +0100
Subject: [PATCH 260/305] can: gs_usb: fix return value of the "set_bittiming"
 callback

The "set_bittiming" callback treats a positive return value as error!
For that reason "can_changelink()" will quit silently after setting
the bittiming values without processing ctrlmode, restart-ms, etc.

Signed-off-by: Wolfgang Grandegger <wg@grandegger.com>
Cc: linux-stable <stable@vger.kernel.org>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/usb/gs_usb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index 68ac3e88a8ce..8bf80ad9dc44 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -449,7 +449,7 @@ static int gs_usb_set_bittiming(struct net_device *netdev)
 		dev_err(netdev->dev.parent, "Couldn't set bittimings (err=%d)",
 			rc);
 
-	return rc;
+	return (rc > 0) ? 0 : rc;
 }
 
 static void gs_usb_xmit_callback(struct urb *urb)

From 13454c14550065fcc1705d6bd4ee6d40e057099f Mon Sep 17 00:00:00 2001
From: Luu An Phu <phu.luuan@nxp.com>
Date: Tue, 2 Jan 2018 10:44:18 +0700
Subject: [PATCH 261/305] can: flex_can: Correct the checking for frame length
 in flexcan_start_xmit()

The flexcan_start_xmit() function compares the frame length with the data
register length to decide whether to write frame content into the data[0]
and data[1] registers. The data register length is 4 bytes and the maximum
frame length is 8 bytes.

Fix the check that compares the frame length with 3: because the register
length is 4, the data[1] register is only needed when the frame length
exceeds 4.

Signed-off-by: Luu An Phu <phu.luuan@nxp.com>
Reviewed-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/flexcan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index 0626dcfd1f3d..760d2c07e3a2 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -526,7 +526,7 @@ static int flexcan_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		data = be32_to_cpup((__be32 *)&cf->data[0]);
 		flexcan_write(data, &priv->tx_mb->data[0]);
 	}
-	if (cf->can_dlc > 3) {
+	if (cf->can_dlc > 4) {
 		data = be32_to_cpup((__be32 *)&cf->data[4]);
 		flexcan_write(data, &priv->tx_mb->data[1]);
 	}

From 6ebc5e8fe85286c7392f1777a3dba9e1fd6d0253 Mon Sep 17 00:00:00 2001
From: Martin Lederhilger <m.lederhilger@ds-automotion.com>
Date: Thu, 21 Dec 2017 14:42:44 +0000
Subject: [PATCH 262/305] can: ems_usb: improve error reporting for error
 warning and error passive

This patch adds the missing CAN_ERR_CRTL to cf->can_id in case of
CAN_STATE_ERROR_WARNING or CAN_STATE_ERROR_PASSIVE

Signed-off-by: Martin Lederhilger <m.lederhilger@ds-automotion.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/usb/ems_usb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c
index b00358297424..12ff0020ecd6 100644
--- a/drivers/net/can/usb/ems_usb.c
+++ b/drivers/net/can/usb/ems_usb.c
@@ -395,6 +395,7 @@ static void ems_usb_rx_err(struct ems_usb *dev, struct ems_cpc_msg *msg)
 
 		if (dev->can.state == CAN_STATE_ERROR_WARNING ||
 		    dev->can.state == CAN_STATE_ERROR_PASSIVE) {
+			cf->can_id |= CAN_ERR_CRTL;
 			cf->data[1] = (txerr > rxerr) ?
 			    CAN_ERR_CRTL_TX_PASSIVE : CAN_ERR_CRTL_RX_PASSIVE;
 		}

From abb62c46d4949d44979fa647740feff3f7538799 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 29 Dec 2017 21:15:54 +0900
Subject: [PATCH 263/305] arm64: dts: uniphier: fix gpio-ranges property of
 PXs3 SoC

This is probably a copy-paste mistake.  The gpio-ranges of PXs3 is
different from that of LD20.

Fixes: 277b51e7050f ("arm64: dts: uniphier: add GPIO controller nodes")
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
index 48e733136db4..0ac2ace82435 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
+++ b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
@@ -198,8 +198,8 @@
 			gpio-controller;
 			#gpio-cells = <2>;
 			gpio-ranges = <&pinctrl 0 0 0>,
-				      <&pinctrl 96 0 0>,
-				      <&pinctrl 160 0 0>;
+				      <&pinctrl 104 0 0>,
+				      <&pinctrl 168 0 0>;
 			gpio-ranges-group-names = "gpio_range0",
 						  "gpio_range1",
 						  "gpio_range2";

From fb32dd3abf7a8fc13271d0d1c45ffc66df28dd15 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@ovn.org>
Date: Tue, 2 Jan 2018 20:14:42 -0800
Subject: [PATCH 264/305] MAINTAINERS: Update my email address.

Signed-off-by: Pravin Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index a6e86e20761e..1e6872b4c6e2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10137,7 +10137,7 @@ F:	drivers/irqchip/irq-ompic.c
 F:	drivers/irqchip/irq-or1k-*
 
 OPENVSWITCH
-M:	Pravin Shelar <pshelar@nicira.com>
+M:	Pravin B Shelar <pshelar@ovn.org>
 L:	netdev@vger.kernel.org
 L:	dev@openvswitch.org
 W:	http://openvswitch.org

From f428fe4a04cc339166c8bbd489789760de3a0cee Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@openvz.org>
Date: Tue, 2 Jan 2018 23:27:33 -0800
Subject: [PATCH 265/305] rtnetlink: give a user socket to get_target_net()

This function is used from two places: rtnl_dump_ifinfo and
rtnl_getlink. In rtnl_getlink(), we give a request skb into
get_target_net(), but in rtnl_dump_ifinfo, we give a response skb
into get_target_net().
The problem here is that NETLINK_CB() isn't initialized for the response
skb. In both cases we can get a user socket and give it instead of skb
into get_target_net().

This bug was found by syzkaller with this call-trace:

kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Modules linked in:
CPU: 1 PID: 3149 Comm: syzkaller140561 Not tainted 4.15.0-rc4-mm1+ #47
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:__netlink_ns_capable+0x8b/0x120 net/netlink/af_netlink.c:868
RSP: 0018:ffff8801c880f348 EFLAGS: 00010206
RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff8443f900
RDX: 000000000000007b RSI: ffffffff86510f40 RDI: 00000000000003d8
RBP: ffff8801c880f360 R08: 0000000000000000 R09: 1ffff10039101e4f
R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff86510f40
R13: 000000000000000c R14: 0000000000000004 R15: 0000000000000011
FS:  0000000001a1a880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020151000 CR3: 00000001c9511005 CR4: 00000000001606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  netlink_ns_capable+0x26/0x30 net/netlink/af_netlink.c:886
  get_target_net+0x9d/0x120 net/core/rtnetlink.c:1765
  rtnl_dump_ifinfo+0x2e5/0xee0 net/core/rtnetlink.c:1806
  netlink_dump+0x48c/0xce0 net/netlink/af_netlink.c:2222
  __netlink_dump_start+0x4f0/0x6d0 net/netlink/af_netlink.c:2319
  netlink_dump_start include/linux/netlink.h:214 [inline]
  rtnetlink_rcv_msg+0x7f0/0xb10 net/core/rtnetlink.c:4485
  netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2441
  rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4540
  netlink_unicast_kernel net/netlink/af_netlink.c:1308 [inline]
  netlink_unicast+0x4be/0x6a0 net/netlink/af_netlink.c:1334
  netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1897

Cc: Jiri Benc <jbenc@redhat.com>
Fixes: 79e1ad148c84 ("rtnetlink: use netnsid to query interface")
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..778d7f03404a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1681,18 +1681,18 @@ static bool link_dump_filtered(struct net_device *dev,
 	return false;
 }
 
-static struct net *get_target_net(struct sk_buff *skb, int netnsid)
+static struct net *get_target_net(struct sock *sk, int netnsid)
 {
 	struct net *net;
 
-	net = get_net_ns_by_id(sock_net(skb->sk), netnsid);
+	net = get_net_ns_by_id(sock_net(sk), netnsid);
 	if (!net)
 		return ERR_PTR(-EINVAL);
 
 	/* For now, the caller is required to have CAP_NET_ADMIN in
 	 * the user namespace owning the target net ns.
 	 */
-	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+	if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) {
 		put_net(net);
 		return ERR_PTR(-EACCES);
 	}
@@ -1733,7 +1733,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 			ifla_policy, NULL) >= 0) {
 		if (tb[IFLA_IF_NETNSID]) {
 			netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-			tgt_net = get_target_net(skb, netnsid);
+			tgt_net = get_target_net(skb->sk, netnsid);
 			if (IS_ERR(tgt_net)) {
 				tgt_net = net;
 				netnsid = -1;
@@ -2883,7 +2883,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (tb[IFLA_IF_NETNSID]) {
 		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-		tgt_net = get_target_net(skb, netnsid);
+		tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
 		if (IS_ERR(tgt_net))
 			return PTR_ERR(tgt_net);
 	}

From 879626e3a52630316d817cbda7cec9a5446d1d82 Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Wed, 3 Jan 2018 16:46:29 +0100
Subject: [PATCH 266/305] net: stmmac: enable EEE in MII, GMII or RGMII only

Note in the databook - Section 4.4 - EEE :
" The EEE feature is not supported when the MAC is configured to use the
TBI, RTBI, SMII, RMII or SGMII single PHY interface. Even if the MAC
supports multiple PHY interfaces, you should activate the EEE mode only
when the MAC is operating with GMII, MII, or RGMII interface."

Applying this restriction solves a stability issue observed on Amlogic
gxl platforms operating with RMII interface and the internal PHY.

Fixes: 83bf79b6bb64 ("stmmac: disable at run-time the EEE if not supported")
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Tested-by: Arnaud Patard <arnaud.patard@rtp-net.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 337d53d12e94..c0af0bc4e714 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -364,9 +364,15 @@ static void stmmac_eee_ctrl_timer(struct timer_list *t)
 bool stmmac_eee_init(struct stmmac_priv *priv)
 {
 	struct net_device *ndev = priv->dev;
+	int interface = priv->plat->interface;
 	unsigned long flags;
 	bool ret = false;
 
+	if ((interface != PHY_INTERFACE_MODE_MII) &&
+	    (interface != PHY_INTERFACE_MODE_GMII) &&
+	    !phy_interface_mode_is_rgmii(interface))
+		goto out;
+
 	/* Using PCS we cannot dial with the phy registers at this stage
 	 * so we do not support extra feature like EEE.
 	 */

From dfe8266b8dd10e12a731c985b725fcf7f0e537f0 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Wed, 3 Jan 2018 20:09:49 +0300
Subject: [PATCH 267/305] sh_eth: fix TSU resource handling

When switching the driver to the managed device API, I managed to break
the case of dual Ether devices sharing a single TSU: the 2nd Ether port
wouldn't probe. Iwamatsu-san has tried to fix this but his patch was buggy
and he then dropped the ball...

The solution is to limit calling devm_request_mem_region() to the first
of the two ports sharing the same TSU, so devm_ioremap_resource() can't
be used anymore for the TSU resource...

Fixes: d5e07e69218f ("sh_eth: use managed device API")
Reported-by: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.yj@renesas.com>
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 75323000c364..1bdd67a8a869 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -3225,10 +3225,29 @@ static int sh_eth_drv_probe(struct platform_device *pdev)
 	/* ioremap the TSU registers */
 	if (mdp->cd->tsu) {
 		struct resource *rtsu;
+
 		rtsu = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-		mdp->tsu_addr = devm_ioremap_resource(&pdev->dev, rtsu);
-		if (IS_ERR(mdp->tsu_addr)) {
-			ret = PTR_ERR(mdp->tsu_addr);
+		if (!rtsu) {
+			dev_err(&pdev->dev, "no TSU resource\n");
+			ret = -ENODEV;
+			goto out_release;
+		}
+		/* We can only request the  TSU region  for the first port
+		 * of the two  sharing this TSU for the probe to succeed...
+		 */
+		if (devno % 2 == 0 &&
+		    !devm_request_mem_region(&pdev->dev, rtsu->start,
+					     resource_size(rtsu),
+					     dev_name(&pdev->dev))) {
+			dev_err(&pdev->dev, "can't request TSU resource.\n");
+			ret = -EBUSY;
+			goto out_release;
+		}
+		mdp->tsu_addr = devm_ioremap(&pdev->dev, rtsu->start,
+					     resource_size(rtsu));
+		if (!mdp->tsu_addr) {
+			dev_err(&pdev->dev, "TSU region ioremap() failed.\n");
+			ret = -ENOMEM;
 			goto out_release;
 		}
 		mdp->port = devno % 2;

From 7d11f77f84b27cef452cee332f4e469503084737 Mon Sep 17 00:00:00 2001
From: Mohamed Ghannam <simo.ghannam@gmail.com>
Date: Wed, 3 Jan 2018 21:06:06 +0000
Subject: [PATCH 268/305] RDS: null pointer dereference in rds_atomic_free_op

Set rm->atomic.op_active to 0 when rds_pin_pages() fails or the
user-supplied address is invalid. This prevents a NULL pointer
dereference in rds_atomic_free_op().

Signed-off-by: Mohamed Ghannam <simo.ghannam@gmail.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/rdma.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 94729d9da437..634cfcb7bba6 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -877,6 +877,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
 err:
 	if (page)
 		put_page(page);
+	rm->atomic.op_active = 0;
 	kfree(rm->atomic.op_notifier);
 
 	return ret;

From 7bbfe00e025240505db3e04c3b296d7c023b2a26 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Wed, 3 Jan 2018 14:11:59 -0800
Subject: [PATCH 269/305] ipv6: fix general protection fault in fib6_add()

In fib6_add(), pn could be NULL if fib6_add_1() failed to return a fib6
node. Checking pn != fn before accessing pn->leaf makes sure pn is not
NULL.
This fixes the following GPF reported by syzkaller:
general protection fault: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 3201 Comm: syzkaller001778 Not tainted 4.15.0-rc5+ #151
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:fib6_add+0x736/0x15a0 net/ipv6/ip6_fib.c:1244
RSP: 0018:ffff8801c7626a70 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: 0000000000000020 RCX: ffffffff84794465
RDX: 0000000000000004 RSI: ffff8801d38935f0 RDI: 0000000000000282
RBP: ffff8801c7626da0 R08: 1ffff10038ec4c35 R09: 0000000000000000
R10: ffff8801c7626c68 R11: 0000000000000000 R12: 00000000fffffffe
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000009
FS:  0000000000000000(0000) GS:ffff8801db200000(0063) knlGS:0000000009b70840
CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
CR2: 0000000020be1000 CR3: 00000001d585a006 CR4: 00000000001606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1006
 ip6_route_multipath_add+0xd14/0x16c0 net/ipv6/route.c:3833
 inet6_rtm_newroute+0xdc/0x160 net/ipv6/route.c:3957
 rtnetlink_rcv_msg+0x733/0x1020 net/core/rtnetlink.c:4411
 netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2408
 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4423
 netlink_unicast_kernel net/netlink/af_netlink.c:1275 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1301
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1864
 sock_sendmsg_nosec net/socket.c:636 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:646
 sock_write_iter+0x31a/0x5d0 net/socket.c:915
 call_write_iter include/linux/fs.h:1772 [inline]
 do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653
 do_iter_write+0x154/0x540 fs/read_write.c:932
 compat_writev+0x225/0x420 fs/read_write.c:1246
 do_compat_writev+0x115/0x220 fs/read_write.c:1267
 C_SYSC_writev fs/read_write.c:1278 [inline]
 compat_SyS_writev+0x26/0x30 fs/read_write.c:1274
 do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline]
 do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389
 entry_SYSENTER_compat+0x54/0x63 arch/x86/entry/entry_64_compat.S:125

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f5285f4e1d08..d11a5578e4f8 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1241,23 +1241,28 @@ out:
 		 * If fib6_add_1 has cleared the old leaf pointer in the
 		 * super-tree leaf node we have to find a new one for it.
 		 */
-		struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
-					    lockdep_is_held(&table->tb6_lock));
-		if (pn != fn && pn_leaf == rt) {
-			pn_leaf = NULL;
-			RCU_INIT_POINTER(pn->leaf, NULL);
-			atomic_dec(&rt->rt6i_ref);
-		}
-		if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
-			pn_leaf = fib6_find_prefix(info->nl_net, table, pn);
-#if RT6_DEBUG >= 2
-			if (!pn_leaf) {
-				WARN_ON(!pn_leaf);
-				pn_leaf = info->nl_net->ipv6.ip6_null_entry;
+		if (pn != fn) {
+			struct rt6_info *pn_leaf =
+				rcu_dereference_protected(pn->leaf,
+				    lockdep_is_held(&table->tb6_lock));
+			if (pn_leaf == rt) {
+				pn_leaf = NULL;
+				RCU_INIT_POINTER(pn->leaf, NULL);
+				atomic_dec(&rt->rt6i_ref);
 			}
+			if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
+				pn_leaf = fib6_find_prefix(info->nl_net, table,
+							   pn);
+#if RT6_DEBUG >= 2
+				if (!pn_leaf) {
+					WARN_ON(!pn_leaf);
+					pn_leaf =
+					    info->nl_net->ipv6.ip6_null_entry;
+				}
 #endif
-			atomic_inc(&pn_leaf->rt6i_ref);
-			rcu_assign_pointer(pn->leaf, pn_leaf);
+				atomic_inc(&pn_leaf->rt6i_ref);
+				rcu_assign_pointer(pn->leaf, pn_leaf);
+			}
 		}
 #endif
 		goto failure;

From 6926e041a8920c8ec27e4e155efa760aa01551fd Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Wed, 3 Jan 2018 23:14:21 +0100
Subject: [PATCH 270/305] uapi/if_ether.h: prevent redefinition of struct
 ethhdr

Musl provides its own ethhdr struct definition. Add a guard to prevent
the kernel's definition if the appropriate musl header has already been
included.

glibc does not implement this header, but if glibc implements it in the
future, it can just define __UAPI_DEF_ETHHDR 0 to make it work with the
kernel.

Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h    | 3 +++
 include/uapi/linux/libc-compat.h | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 3ee3bf7c8526..144de4d2f385 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -23,6 +23,7 @@
 #define _UAPI_LINUX_IF_ETHER_H
 
 #include <linux/types.h>
+#include <linux/libc-compat.h>
 
 /*
  *	IEEE 802.3 Ethernet magic constants.  The frame sizes omit the preamble
@@ -149,11 +150,13 @@
  *	This is an Ethernet frame header.
  */
 
+#if __UAPI_DEF_ETHHDR
 struct ethhdr {
 	unsigned char	h_dest[ETH_ALEN];	/* destination eth addr	*/
 	unsigned char	h_source[ETH_ALEN];	/* source ether addr	*/
 	__be16		h_proto;		/* packet type ID field	*/
 } __attribute__((packed));
+#endif
 
 
 #endif /* _UAPI_LINUX_IF_ETHER_H */
diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h
index 8254c937c9f4..fc29efaa918c 100644
--- a/include/uapi/linux/libc-compat.h
+++ b/include/uapi/linux/libc-compat.h
@@ -264,4 +264,10 @@
 
 #endif /* __GLIBC__ */
 
+/* Definitions for if_ether.h */
+/* allow libcs like musl to deactivate this, glibc does not implement this. */
+#ifndef __UAPI_DEF_ETHHDR
+#define __UAPI_DEF_ETHHDR		1
+#endif
+
 #endif /* _UAPI_LIBC_COMPAT_H */

From f5a40711fa58f1c109165a4fec6078bf2dfd2bdc Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Thu, 28 Dec 2017 19:06:20 +0300
Subject: [PATCH 271/305] x86/mm: Set MODULES_END to 0xffffffffff000000

Since f06bdd4001c2 ("x86/mm: Adapt MODULES_END based on fixmap section size")
kasan_mem_to_shadow(MODULES_END) might not be aligned to a page boundary.

Passing a page-unaligned address to kasan_populate_zero_shadow() has two
possible effects:

1) It may leave a one-page hole in an area that is supposed to be populated.
  After commit 21506525fb8d ("x86/kasan/64: Teach KASAN about the
  cpu_entry_area") that hole happens to be in the shadow covering the
  fixmap area and leads to a crash:

 BUG: unable to handle kernel paging request at fffffbffffe8ee04
 RIP: 0010:check_memory_region+0x5c/0x190

 Call Trace:
  <NMI>
  memcpy+0x1f/0x50
  ghes_copy_tofrom_phys+0xab/0x180
  ghes_read_estatus+0xfb/0x280
  ghes_notify_nmi+0x2b2/0x410
  nmi_handle+0x115/0x2c0
  default_do_nmi+0x57/0x110
  do_nmi+0xf8/0x150
  end_repeat_nmi+0x1a/0x1e

Note, the crash likely disappeared after commit 92a0f81d8957, which
changed the kasan_populate_zero_shadow() call back to the way it was
before commit 21506525fb8d.

2) An attempt to load a module near MODULES_END will fail, because
   __vmalloc_node_range() called from kasan_module_alloc() will hit the
   WARN_ON(!pte_none(*pte)) in vmap_pte_range() and bail out with an error.

To fix this we need to make kasan_mem_to_shadow(MODULES_END) page aligned
which means that MODULES_END should be 8*PAGE_SIZE aligned.

The whole point of commit f06bdd4001c2 was to move MODULES_END down if
NR_CPUS is big, so the cpu_entry_area takes a lot of space.
But since 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap")
the cpu_entry_area is no longer in fixmap, so we could just set
MODULES_END to a fixed 8*PAGE_SIZE aligned address.

Fixes: f06bdd4001c2 ("x86/mm: Adapt MODULES_END based on fixmap section size")
Reported-by: Jakub Kicinski <kubakici@wp.pl>
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Garnier <thgarnie@google.com>
Link: https://lkml.kernel.org/r/20171228160620.23818-1-aryabinin@virtuozzo.com
---
 Documentation/x86/x86_64/mm.txt         | 5 +----
 arch/x86/include/asm/pgtable_64_types.h | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index ad41b3813f0a..ddd5ffd31bd0 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -43,7 +43,7 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space
+ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
 [fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
 ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
@@ -67,9 +67,6 @@ memory window (this size is arbitrary, it can be raised later if needed).
 The mappings are not part of any other kernel PGD and are only available
 during EFI runtime calls.
 
-The module mapping space size changes based on the CONFIG requirements for the
-following fixmap section.
-
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index b97a539bcdee..6233e5595389 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -104,7 +104,7 @@ typedef struct { pteval_t pte; } pte_t;
 
 #define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 /* The module sections ends with the start of the fixmap */
-#define MODULES_END		__fix_to_virt(__end_of_fixed_addresses + 1)
+#define MODULES_END		_AC(0xffffffffff000000, UL)
 #define MODULES_LEN		(MODULES_END - MODULES_VADDR)
 
 #define ESPFIX_PGD_ENTRY	_AC(-2, UL)

From f2078904810373211fb15f91888fba14c01a4acc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 4 Jan 2018 13:01:40 +0100
Subject: [PATCH 272/305] x86/mm: Map cpu_entry_area at the same place on 4/5
 level

There is no reason for 4 and 5 level pagetables to have a different
layout. It just makes determining vaddr_end for KASLR harder than
necessary.

Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Benjamin Gilbert <benjamin.gilbert@coreos.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: stable <stable@vger.kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Garnier <thgarnie@google.com>,
Cc: Alexander Kuleshov <kuleshovmail@gmail.com>
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos
---
 Documentation/x86/x86_64/mm.txt         | 7 ++++---
 arch/x86/include/asm/pgtable_64_types.h | 4 ++--
 arch/x86/mm/dump_pagetables.c           | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index ddd5ffd31bd0..f7dabe1f01e9 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,8 +12,8 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
-fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
-fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
+fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
+fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
@@ -37,7 +37,8 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
-fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
+fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
+... unused hole ...
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6233e5595389..61b4b60bdc13 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -88,7 +88,7 @@ typedef struct { pteval_t pte; } pte_t;
 # define VMALLOC_SIZE_TB	_AC(32, UL)
 # define __VMALLOC_BASE		_AC(0xffffc90000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)
-# define LDT_PGD_ENTRY		_AC(-4, UL)
+# define LDT_PGD_ENTRY		_AC(-3, UL)
 # define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 
@@ -110,7 +110,7 @@ typedef struct { pteval_t pte; } pte_t;
 #define ESPFIX_PGD_ENTRY	_AC(-2, UL)
 #define ESPFIX_BASE_ADDR	(ESPFIX_PGD_ENTRY << P4D_SHIFT)
 
-#define CPU_ENTRY_AREA_PGD	_AC(-3, UL)
+#define CPU_ENTRY_AREA_PGD	_AC(-4, UL)
 #define CPU_ENTRY_AREA_BASE	(CPU_ENTRY_AREA_PGD << P4D_SHIFT)
 
 #define EFI_VA_START		( -4 * (_AC(1, UL) << 30))
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index f56902c1f04b..2a4849e92831 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -61,10 +61,10 @@ enum address_markers_idx {
 	KASAN_SHADOW_START_NR,
 	KASAN_SHADOW_END_NR,
 #endif
+	CPU_ENTRY_AREA_NR,
 #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
 	LDT_NR,
 #endif
-	CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
 	ESPFIX_START_NR,
 #endif

From 1dddd25125112ba49706518ac9077a1026a18f37 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 4 Jan 2018 12:32:03 +0100
Subject: [PATCH 273/305] x86/kaslr: Fix the vaddr_end mess

vaddr_end for KASLR is only documented in the KASLR code itself and is
adjusted depending on config options. So it's not surprising that a change
of the memory layout causes KASLR to have the wrong vaddr_end. This can map
arbitrary stuff into other areas causing hard to understand problems.

Remove the whole ifdef magic and define the start of the cpu_entry_area to
be the end of the KASLR vaddr range.

Add documentation to that effect.

Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap")
Reported-by: Benjamin Gilbert <benjamin.gilbert@coreos.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Benjamin Gilbert <benjamin.gilbert@coreos.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: stable <stable@vger.kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Garnier <thgarnie@google.com>,
Cc: Alexander Kuleshov <kuleshovmail@gmail.com>
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos
---
 Documentation/x86/x86_64/mm.txt         |  6 +++++
 arch/x86/include/asm/pgtable_64_types.h |  8 ++++++-
 arch/x86/mm/kaslr.c                     | 32 +++++++------------------
 3 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index f7dabe1f01e9..ea91cb61a602 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+				    vaddr_end for KASLR
 fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
 fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
@@ -37,6 +38,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+				    vaddr_end for KASLR
 fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
 ... unused hole ...
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
@@ -71,3 +73,7 @@ during EFI runtime calls.
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
+
+Be very careful vs. KASLR when changing anything here. The KASLR address
+range must not overlap with anything except the KASAN shadow area, which is
+correct as KASAN disables KASLR.
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 61b4b60bdc13..6b8f73dcbc2c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -75,7 +75,13 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_SIZE	(_AC(1, UL) << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
 
-/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+/*
+ * See Documentation/x86/x86_64/mm.txt for a description of the memory map.
+ *
+ * Be very careful vs. KASLR when changing anything here. The KASLR address
+ * range must not overlap with anything except the KASAN shadow area, which
+ * is correct as KASAN disables KASLR.
+ */
 #define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 
 #ifdef CONFIG_X86_5LEVEL
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 879ef930e2c2..aedebd2ebf1e 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -34,25 +34,14 @@
 #define TB_SHIFT 40
 
 /*
- * Virtual address start and end range for randomization. The end changes base
- * on configuration to have the highest amount of space for randomization.
- * It increases the possible random position for each randomized region.
+ * Virtual address start and end range for randomization.
  *
- * You need to add an if/def entry if you introduce a new memory region
- * compatible with KASLR. Your entry must be in logical order with memory
- * layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
- * ensure that this order is correct and won't be changed.
+ * The end address could depend on more configuration options to make the
+ * highest amount of space for randomization available, but that's too hard
+ * to keep straight and caused issues already.
  */
 static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
-
-#if defined(CONFIG_X86_ESPFIX64)
-static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
-#elif defined(CONFIG_EFI)
-static const unsigned long vaddr_end = EFI_VA_END;
-#else
-static const unsigned long vaddr_end = __START_KERNEL_map;
-#endif
+static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
 
 /* Default values */
 unsigned long page_offset_base = __PAGE_OFFSET_BASE;
@@ -101,15 +90,12 @@ void __init kernel_randomize_memory(void)
 	unsigned long remain_entropy;
 
 	/*
-	 * All these BUILD_BUG_ON checks ensures the memory layout is
-	 * consistent with the vaddr_start/vaddr_end variables.
+	 * These BUILD_BUG_ON checks ensure the memory layout is consistent
+	 * with the vaddr_start/vaddr_end variables. These checks are very
+	 * limited....
 	 */
 	BUILD_BUG_ON(vaddr_start >= vaddr_end);
-	BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) &&
-		     vaddr_end >= EFI_VA_END);
-	BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) ||
-		      IS_ENABLED(CONFIG_EFI)) &&
-		     vaddr_end >= __START_KERNEL_map);
+	BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
 	BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
 
 	if (!kaslr_memory_enabled())

From 42f3bdc5dd962a5958bc024c1e1444248a6b8b4a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 4 Jan 2018 18:07:12 +0100
Subject: [PATCH 274/305] x86/events/intel/ds: Use the proper cache flush
 method for mapping ds buffers

Thomas reported the following warning:

 BUG: using smp_processor_id() in preemptible [00000000] code: ovsdb-server/4498
 caller is native_flush_tlb_single+0x57/0xc0
 native_flush_tlb_single+0x57/0xc0
 __set_pte_vaddr+0x2d/0x40
 set_pte_vaddr+0x2f/0x40
 cea_set_pte+0x30/0x40
 ds_update_cea.constprop.4+0x4d/0x70
 reserve_ds_buffers+0x159/0x410
 x86_reserve_hardware+0x150/0x160
 x86_pmu_event_init+0x3e/0x1f0
 perf_try_init_event+0x69/0x80
 perf_event_alloc+0x652/0x740
 SyS_perf_event_open+0x3f6/0xd60
 do_syscall_64+0x5c/0x190

set_pte_vaddr is used to map the ds buffers into the cpu entry area, but
there are two problems with that:

 1) The resulting flush is not supposed to be called in preemptible context

 2) The cpu entry area is supposed to be per CPU, but the debug store
    buffers are mapped for all CPUs so these mappings need to be flushed
    globally.

Add the necessary preemption protection across the mapping code and flush
TLBs globally.

Fixes: c1961a4631da ("x86/events/intel/ds: Map debug buffers in cpu_entry_area")
Reported-by: Thomas Zeitlhofer <thomas.zeitlhofer+lkml@ze-it.at>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Zeitlhofer <thomas.zeitlhofer+lkml@ze-it.at>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180104170712.GB3040@hirez.programming.kicks-ass.net
---
 arch/x86/events/intel/ds.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8f0aace08b87..8156e47da7ba 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -5,6 +5,7 @@
 
 #include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
+#include <asm/tlbflush.h>
 #include <asm/insn.h>
 
 #include "../perf_event.h"
@@ -283,20 +284,35 @@ static DEFINE_PER_CPU(void *, insn_buffer);
 
 static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
 {
+	unsigned long start = (unsigned long)cea;
 	phys_addr_t pa;
 	size_t msz = 0;
 
 	pa = virt_to_phys(addr);
+
+	preempt_disable();
 	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
 		cea_set_pte(cea, pa, prot);
+
+	/*
+	 * This is a cross-CPU update of the cpu_entry_area, we must shoot down
+	 * all TLB entries for it.
+	 */
+	flush_tlb_kernel_range(start, start + size);
+	preempt_enable();
 }
 
 static void ds_clear_cea(void *cea, size_t size)
 {
+	unsigned long start = (unsigned long)cea;
 	size_t msz = 0;
 
+	preempt_disable();
 	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
 		cea_set_pte(cea, 0, PAGE_NONE);
+
+	flush_tlb_kernel_range(start, start + size);
+	preempt_enable();
 }
 
 static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)

From 1e5476815fd7f98b888e01a0f9522b63085f96c9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 4 Jan 2018 22:19:04 +0100
Subject: [PATCH 275/305] x86/tlb: Drop the _GPL from the cpu_tlbstate export

The recent changes for PTI touch cpu_tlbstate from various tlb_flush
inlines. cpu_tlbstate is exported as GPL symbol, so this causes a
regression when building out of tree drivers for certain graphics cards.

Aside from that, the export has been wrong since it was introduced, as it
should have been EXPORT_PER_CPU_SYMBOL_GPL().

Use the correct PER_CPU export and drop the _GPL to restore the previous
state which allows users to utilize the cards they paid for.

As always I'm really thrilled to make this kind of change to support the
#friends (or however the hot hashtag of today is spelled) from that closet
sauce graphics corp.

Fixes: 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4")
Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches")
Reported-by: Kees Cook <keescook@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: stable@vger.kernel.org
---
 arch/x86/mm/init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 80259ad8c386..6b462a472a7b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -870,7 +870,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
-EXPORT_SYMBOL_GPL(cpu_tlbstate);
+EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
 
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {

From e8c24773d6b2cd9bc8b36bd6e60beff599be14be Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Thu, 4 Jan 2018 16:17:45 -0800
Subject: [PATCH 276/305] mm: check pfn_valid first in zero_resv_unavail

With latest kernel I get below bug while testing kdump:

  BUG: unable to handle kernel paging request at ffffea00034b1040
  IP: zero_resv_unavail+0xbd/0x126
  PGD 37b98067 P4D 37b98067 PUD 37b97067 PMD 0
  Oops: 0002 [#1] SMP
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper Not tainted 4.15.0-rc1+ #316
  Hardware name: LENOVO 20ARS1BJ02/20ARS1BJ02, BIOS GJET92WW (2.42 ) 03/03/2017
  task: ffffffff81a0e4c0 task.stack: ffffffff81a00000
  RIP: 0010:zero_resv_unavail+0xbd/0x126
  RSP: 0000:ffffffff81a03d88 EFLAGS: 00010006
  RAX: 0000000000000000 RBX: ffffea00034b1040 RCX: 0000000000000010
  RDX: 0000000000000000 RSI: 0000000000000092 RDI: ffffea00034b1040
  RBP: 00000000000d2c41 R08: 00000000000000c0 R09: 0000000000000a0d
  R10: 0000000000000002 R11: 0000000000007f01 R12: ffffffff81a03d90
  R13: ffffea0000000000 R14: 0000000000000063 R15: 0000000000000062
  FS:  0000000000000000(0000) GS:ffffffff81c73000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffffea00034b1040 CR3: 0000000037609000 CR4: 00000000000606b0
  Call Trace:
   ? free_area_init_nodes+0x640/0x664
   ? zone_sizes_init+0x58/0x72
   ? setup_arch+0xb50/0xc6c
   ? start_kernel+0x64/0x43d
   ? secondary_startup_64+0xa5/0xb0
  Code: c1 e8 0c 48 39 d8 76 27 48 89 de 48 c1 e3 06 48 c7 c7 7a 87 79 81 e8 b0 c0 3e ff 4c 01 eb b9 10 00 00 00 31 c0 48 89 df 49 ff c6 <f3> ab eb bc 6a 00 49 c7 c0 f0 93 d1 81 31 d2 83 ce ff 41 54 49
  RIP: zero_resv_unavail+0xbd/0x126 RSP: ffffffff81a03d88
  CR2: ffffea00034b1040
  ---[ end trace f5ba9e8f73c7ee26 ]---

This is introduced by commit a4a3ede2132a ("mm: zero reserved and
unavailable struct pages").

The reason is that some efi reserved boot ranges are not reported in E820
ram.  In my case it is a bgrt buffer:

  efi: mem00: [Boot Data          |RUN|  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x00000000d2c41000-0x00000000d2c85fff] (0MB)

Using "add_efi_memmap" can work around the problem, together with another
fix:

  http://lkml.kernel.org/r/20171130052327.GA3500@dhcp-128-65.nay.redhat.com

In zero_resv_unavail() it is better to check pfn_valid() first, before
zeroing the page struct.  This fixes the problem and potentially other
similar problems.  Also, as Pavel Tatashin suggested, check pfn_valid() at
the beginning of the section.

The range is backed by real memory.  The memory range is efi "Boot
Service Data", which means that after ExitBootServices() these ranges can
be used as system ram.  But some of them need to be reserved, for example
the bgrt image address in an acpi table: if the image memory is freed
then a kexec reboot will fail, because kexec inherits the same acpi table
to initialize the driver.

Link: http://lkml.kernel.org/r/20171201095048.GA3084@dhcp-128-65.nay.redhat.com
Fixes: a4a3ede2132a ("mm: zero reserved and unavailable struct pages")
Signed-off-by: Dave Young <dyoung@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e5e775e97f4..76c9688b6a0a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6260,6 +6260,8 @@ void __paginginit zero_resv_unavail(void)
 	pgcnt = 0;
 	for_each_resv_unavail_range(i, &start, &end) {
 		for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
+			if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages)))
+				continue;
 			mm_zero_struct_page(pfn_to_page(pfn));
 			pgcnt++;
 		}

From 4d9570158b6260f449e317a5f9ed030c2504a615 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 4 Jan 2018 16:17:49 -0800
Subject: [PATCH 277/305] kernel/acct.c: fix the acct->needcheck check in
 check_free_space()

As Tsukada explains, the time_is_before_jiffies(acct->needcheck) check
is very wrong, we need time_is_after_jiffies() to make sys_acct() work.

Ignoring the overflows, the code should "goto out" if needcheck >
jiffies, while currently it checks "needcheck < jiffies" and thus in the
likely case check_free_space() does nothing until jiffies overflow.

In particular this means that sys_acct() is simply broken, acct_on()
sets acct->needcheck = jiffies and expects that check_free_space()
should set acct->active = 1 after the free-space check, but this won't
happen if jiffies increments in between.

This was broken by commit 32dc73086015 ("get rid of timer in
kern/acct.c") in 2011, then another (correct) commit 795a2f22a8ea
("acct() should honour the limits from the very beginning") made the
problem more visible.

Link: http://lkml.kernel.org/r/20171213133940.GA6554@redhat.com
Fixes: 32dc73086015 ("get rid of timer in kern/acct.c")
Reported-by: TSUKADA Koutaro <tsukada@ascade.co.jp>
Suggested-by: TSUKADA Koutaro <tsukada@ascade.co.jp>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/acct.c b/kernel/acct.c
index d15c0ee4d955..addf7732fb56 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct)
 {
 	struct kstatfs sbuf;
 
-	if (time_is_before_jiffies(acct->needcheck))
+	if (time_is_after_jiffies(acct->needcheck))
 		goto out;
 
 	/* May block */

From 4991c09c7c812dba13ea9be79a68b4565bb1fa4e Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 4 Jan 2018 16:17:52 -0800
Subject: [PATCH 278/305] mm/mprotect: add a cond_resched() inside
 change_pmd_range()

While testing on a large CPU system, the following RCU stall was detected
many times over the span of the workload.  This problem is solved by
adding a cond_resched() in the change_pmd_range() function.

  INFO: rcu_sched detected stalls on CPUs/tasks:
   154-....: (670 ticks this GP) idle=022/140000000000000/0 softirq=2825/2825 fqs=612
   (detected by 955, t=6002 jiffies, g=4486, c=4485, q=90864)
  Sending NMI from CPU 955 to CPUs 154:
  NMI backtrace for cpu 154
  CPU: 154 PID: 147071 Comm: workload Not tainted 4.15.0-rc3+ #3
  NIP:  c0000000000b3f64 LR: c0000000000b33d4 CTR: 000000000000aa18
  REGS: 00000000a4b0fb44 TRAP: 0501   Not tainted  (4.15.0-rc3+)
  MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22422082  XER: 00000000
  CFAR: 00000000006cf8f0 SOFTE: 1
  GPR00: 0010000000000000 c00003ef9b1cb8c0 c0000000010cc600 0000000000000000
  GPR04: 8e0000018c32b200 40017b3858fd6e00 8e0000018c32b208 40017b3858fd6e00
  GPR08: 8e0000018c32b210 40017b3858fd6e00 8e0000018c32b218 40017b3858fd6e00
  GPR12: ffffffffffffffff c00000000fb25100
  NIP [c0000000000b3f64] plpar_hcall9+0x44/0x7c
  LR [c0000000000b33d4] pSeries_lpar_flush_hash_range+0x384/0x420
  Call Trace:
    flush_hash_range+0x48/0x100
    __flush_tlb_pending+0x44/0xd0
    hpte_need_flush+0x408/0x470
    change_protection_range+0xaac/0xf10
    change_prot_numa+0x30/0xb0
    task_numa_work+0x2d0/0x3e0
    task_work_run+0x130/0x190
    do_notify_resume+0x118/0x120
    ret_from_except_lite+0x70/0x74
  Instruction dump:
  60000000 f8810028 7ca42b78 7cc53378 7ce63b78 7d074378 7d284b78 7d495378
  e9410060 e9610068 e9810070 44000022 <7d806378> e9810028 f88c0000 f8ac0008

Link: http://lkml.kernel.org/r/20171214140551.5794-1-khandual@linux.vnet.ibm.com
Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Suggested-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mprotect.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index ec39f730a0bf..58b629bb70de 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -166,7 +166,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		next = pmd_addr_end(addr, end);
 		if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
 				&& pmd_none_or_clear_bad(pmd))
-			continue;
+			goto next;
 
 		/* invoke the mmu notifier if the pmd is populated */
 		if (!mni_start) {
@@ -188,7 +188,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 					}
 
 					/* huge pmd was handled */
-					continue;
+					goto next;
 				}
 			}
 			/* fall through, the trans huge pmd just split */
@@ -196,6 +196,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
 				 dirty_accountable, prot_numa);
 		pages += this_pages;
+next:
+		cond_resched();
 	} while (pmd++, addr = next, addr != end);
 
 	if (mni_start)

From dc8635b78cd8669c37e230058d18c33af7451ab1 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 4 Jan 2018 16:17:56 -0800
Subject: [PATCH 279/305] kernel/exit.c: export abort() to modules

gcc -fisolate-erroneous-paths-dereference can generate calls to abort()
from modular code too.

[arnd@arndb.de: drop duplicate exports of abort()]
  Link: http://lkml.kernel.org/r/20180102103311.706364-1-arnd@arndb.de
Reported-by: Vineet Gupta <Vineet.Gupta1@synopsys.com>
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Alexey Brodkin <Alexey.Brodkin@synopsys.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Jose Abreu <Jose.Abreu@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/traps.c       | 1 -
 arch/m32r/kernel/traps.c      | 1 -
 arch/unicore32/kernel/traps.c | 1 -
 kernel/exit.c                 | 1 +
 4 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 5cf04888c581..3e26c6f7a191 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -793,7 +793,6 @@ void abort(void)
 	/* if that doesn't kill us, halt */
 	panic("Oops failed to kill thread");
 }
-EXPORT_SYMBOL(abort);
 
 void __init trap_init(void)
 {
diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c
index cb79fba79d43..b88a8dd14933 100644
--- a/arch/m32r/kernel/traps.c
+++ b/arch/m32r/kernel/traps.c
@@ -122,7 +122,6 @@ void abort(void)
 	/* if that doesn't kill us, halt */
 	panic("Oops failed to kill thread");
 }
-EXPORT_SYMBOL(abort);
 
 void __init trap_init(void)
 {
diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c
index 5f25b39f04d4..c4ac6043ebb0 100644
--- a/arch/unicore32/kernel/traps.c
+++ b/arch/unicore32/kernel/traps.c
@@ -298,7 +298,6 @@ void abort(void)
 	/* if that doesn't kill us, halt */
 	panic("Oops failed to kill thread");
 }
-EXPORT_SYMBOL(abort);
 
 void __init trap_init(void)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index df0c91d5606c..995453d9fb55 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1763,3 +1763,4 @@ __weak void abort(void)
 	/* if that doesn't kill us, halt */
 	panic("Oops failed to kill thread");
 }
+EXPORT_SYMBOL(abort);

From 152a2d199e1385c6ccef17c24555103b30447c91 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Thu, 4 Jan 2018 16:17:59 -0800
Subject: [PATCH 280/305] mm/debug.c: provide useful debugging information for
 VM_BUG

With the recent addition of hashed kernel pointers, places which need to
produce useful debug output have to specify %px, not %p.  This patch
fixes all the VM debug to use %px.  This is appropriate because it's
debug output that the user should never be able to trigger, and kernel
developers need to see the actual pointers.

Link: http://lkml.kernel.org/r/20171219133236.GE13680@bombadil.infradead.org
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Tobin C. Harding" <me@tobin.cc>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/debug.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/mm/debug.c b/mm/debug.c
index d947f3e03b0d..56e2d9125ea5 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -50,7 +50,7 @@ void __dump_page(struct page *page, const char *reason)
 	 */
 	int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
 
-	pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
+	pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
 		  page, page_ref_count(page), mapcount,
 		  page->mapping, page_to_pgoff(page));
 	if (PageCompound(page))
@@ -69,7 +69,7 @@ void __dump_page(struct page *page, const char *reason)
 
 #ifdef CONFIG_MEMCG
 	if (page->mem_cgroup)
-		pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
+		pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup);
 #endif
 }
 
@@ -84,10 +84,10 @@ EXPORT_SYMBOL(dump_page);
 
 void dump_vma(const struct vm_area_struct *vma)
 {
-	pr_emerg("vma %p start %p end %p\n"
-		"next %p prev %p mm %p\n"
-		"prot %lx anon_vma %p vm_ops %p\n"
-		"pgoff %lx file %p private_data %p\n"
+	pr_emerg("vma %px start %px end %px\n"
+		"next %px prev %px mm %px\n"
+		"prot %lx anon_vma %px vm_ops %px\n"
+		"pgoff %lx file %px private_data %px\n"
 		"flags: %#lx(%pGv)\n",
 		vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
 		vma->vm_prev, vma->vm_mm,
@@ -100,27 +100,27 @@ EXPORT_SYMBOL(dump_vma);
 
 void dump_mm(const struct mm_struct *mm)
 {
-	pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
+	pr_emerg("mm %px mmap %px seqnum %d task_size %lu\n"
 #ifdef CONFIG_MMU
-		"get_unmapped_area %p\n"
+		"get_unmapped_area %px\n"
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-		"pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
+		"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 		"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
 		"start_brk %lx brk %lx start_stack %lx\n"
 		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
-		"binfmt %p flags %lx core_state %p\n"
+		"binfmt %px flags %lx core_state %px\n"
 #ifdef CONFIG_AIO
-		"ioctx_table %p\n"
+		"ioctx_table %px\n"
 #endif
 #ifdef CONFIG_MEMCG
-		"owner %p "
+		"owner %px "
 #endif
-		"exe_file %p\n"
+		"exe_file %px\n"
 #ifdef CONFIG_MMU_NOTIFIER
-		"mmu_notifier_mm %p\n"
+		"mmu_notifier_mm %px\n"
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 		"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"

From cdc346b36e1dfec201b24eddb7bdbcff6727db04 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Date: Thu, 4 Jan 2018 16:18:02 -0800
Subject: [PATCH 281/305] mm/zsmalloc.c: include fs.h

`struct file_system_type' and alloc_anon_inode() function are defined in
fs.h, include it directly.

Link: http://lkml.kernel.org/r/20171219104219.3017-1-sergey.senozhatsky@gmail.com
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zsmalloc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 685049a9048d..683c0651098c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -53,6 +53,7 @@
 #include <linux/mount.h>
 #include <linux/migrate.h>
 #include <linux/pagemap.h>
+#include <linux/fs.h>
 
 #define ZSPAGE_MAGIC	0x58
 

From d09cfbbfa0f761a97687828b5afb27b56cbf2e19 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 4 Jan 2018 16:18:06 -0800
Subject: [PATCH 282/305] mm/sparse.c: wrong allocation for mem_section

In commit 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime
for CONFIG_SPARSEMEM_EXTREME=y") mem_section is allocated at runtime to
save memory.

It allocates the first dimension of array with sizeof(struct mem_section).

It costs extra memory, should be sizeof(struct mem_section *).

Fix it.

Link: http://lkml.kernel.org/r/1513932498-20350-1-git-send-email-bhe@redhat.com
Fixes: 83e3c48729 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y")
Signed-off-by: Baoquan He <bhe@redhat.com>
Tested-by: Dave Young <dyoung@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/sparse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/sparse.c b/mm/sparse.c
index 7a5dacaa06e3..2609aba121e8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -211,7 +211,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
 	if (unlikely(!mem_section)) {
 		unsigned long size, align;
 
-		size = sizeof(struct mem_section) * NR_SECTION_ROOTS;
+		size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
 		align = 1 << (INTERNODE_CACHE_SHIFT);
 		mem_section = memblock_virt_alloc(size, align);
 	}

From 0cbb4b4f4c44f54af268969b18d8deda63aded59 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 4 Jan 2018 16:18:09 -0800
Subject: [PATCH 283/305] userfaultfd: clear the vma->vm_userfaultfd_ctx if
 UFFD_EVENT_FORK fails

The previous fix in commit 384632e67e08 ("userfaultfd: non-cooperative:
fix fork use after free") corrected the refcounting in case of
UFFD_EVENT_FORK failure for the fork userfault paths.

That still didn't clear the vma->vm_userfaultfd_ctx of the vmas that
were set to point to the aborted new uffd ctx earlier in
dup_userfaultfd.

Link: http://lkml.kernel.org/r/20171223002505.593-2-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ac9a4e65ca49..41a75f9f23fd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -570,11 +570,14 @@ out:
 static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 					      struct userfaultfd_wait_queue *ewq)
 {
+	struct userfaultfd_ctx *release_new_ctx;
+
 	if (WARN_ON_ONCE(current->flags & PF_EXITING))
 		goto out;
 
 	ewq->ctx = ctx;
 	init_waitqueue_entry(&ewq->wq, current);
+	release_new_ctx = NULL;
 
 	spin_lock(&ctx->event_wqh.lock);
 	/*
@@ -601,8 +604,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 				new = (struct userfaultfd_ctx *)
 					(unsigned long)
 					ewq->msg.arg.reserved.reserved1;
-
-				userfaultfd_ctx_put(new);
+				release_new_ctx = new;
 			}
 			break;
 		}
@@ -617,6 +619,20 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 	__set_current_state(TASK_RUNNING);
 	spin_unlock(&ctx->event_wqh.lock);
 
+	if (release_new_ctx) {
+		struct vm_area_struct *vma;
+		struct mm_struct *mm = release_new_ctx->mm;
+
+		/* the various vma->vm_userfaultfd_ctx still points to it */
+		down_write(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next)
+			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx)
+				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		up_write(&mm->mmap_sem);
+
+		userfaultfd_ctx_put(release_new_ctx);
+	}
+
 	/*
 	 * ctx may go away after this if the userfault pseudo fd is
 	 * already released.

From 9a0e7120109632910e77295ce6fc512c16cd367b Mon Sep 17 00:00:00 2001
From: Jeffy Chen <jeffy.chen@rock-chips.com>
Date: Thu, 4 Jan 2018 16:18:12 -0800
Subject: [PATCH 284/305] mailmap: update Mark Yao's email address

Change the previous employer's email address to the current email
address.

Link: http://lkml.kernel.org/r/20171229121726.31589-1-jeffy.chen@rock-chips.com
Signed-off-by: Jeffy Chen <jeffy.chen@rock-chips.com>
Acked-by: Martin Kepplinger <martink@posteo.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .mailmap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.mailmap b/.mailmap
index 1469ff0d3f4d..e18cab73e209 100644
--- a/.mailmap
+++ b/.mailmap
@@ -107,6 +107,7 @@ Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
 Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
 Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
 Mark Brown <broonie@sirena.org.uk>
+Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
 Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
 Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
 Matthieu CASTET <castet.matthieu@free.fr>

From 107b7d9fa94c4692d9104243f0e793e2a4e1366e Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Wed, 3 Jan 2018 07:32:45 -0500
Subject: [PATCH 285/305] mfd: rtsx: Release IRQ during shutdown

'Commit cc27b735ad3a ("PCI/portdrv: Turn off PCIe services during
shutdown")' revealed a resource leak in rtsx_pci driver during shutdown.

Issue shows up as a warning during shutdown as follows:

remove_proc_entry: removing non-empty directory 'irq/17', leaking at least
'rtsx_pci'
WARNING: CPU: 0 PID: 1578 at fs/proc/generic.c:572
remove_proc_entry+0x11d/0x130
Modules linked in <long list but none that are out-of-tree>
...
Call Trace:
unregister_irq_proc
free_desc
irq_free_descs
mp_unmap_irq
acpi_unregister_gsi_apic
acpi_pci_irq_disable
do_pci_disable_device
pci_disable_device
device_shutdown
kernel_restart
Sys_reboot

Even though rtsx_pci driver implements a shutdown callback, it is not
releasing the interrupt that it registered during probe. This is causing
the ACPI layer to complain that the shared IRQ is in use while freeing
IRQ.

This code releases the IRQ to prevent resource leak and eliminate the
warning.

Fixes: cc27b735ad3a ("PCI/portdrv: Turn off PCIe services during shutdown")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=198141
Reported-by: Chris Clayton <chris2553@googlemail.com>
Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/rtsx_pcr.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/mfd/rtsx_pcr.c b/drivers/mfd/rtsx_pcr.c
index 590fb9aad77d..c3ed885c155c 100644
--- a/drivers/mfd/rtsx_pcr.c
+++ b/drivers/mfd/rtsx_pcr.c
@@ -1543,6 +1543,9 @@ static void rtsx_pci_shutdown(struct pci_dev *pcidev)
 	rtsx_pci_power_off(pcr, HOST_ENTER_S1);
 
 	pci_disable_device(pcidev);
+	free_irq(pcr->irq, (void *)pcr);
+	if (pcr->msi_en)
+		pci_disable_msi(pcr->pci);
 }
 
 #else /* CONFIG_PM */

From b9e705ef7cfaf22db0daab91ad3cd33b0fa32eb9 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 4 Jan 2018 14:37:05 +0000
Subject: [PATCH 286/305] x86/alternatives: Add missing '\n' at end of
 ALTERNATIVE inline asm

Where an ALTERNATIVE is used in the middle of an inline asm block, this
would otherwise lead to the following instruction being appended directly
to the trailing ".popsection", and a failed compile.

Fixes: 9cebed423c84 ("x86, alternative: Use .pushsection/.popsection")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: ak@linux.intel.com
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180104143710.8961-8-dwmw@amazon.co.uk
---
 arch/x86/include/asm/alternative.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index dbfd0854651f..cf5961ca8677 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -140,7 +140,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	".popsection\n"							\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(newinstr, feature, 1)			\
-	".popsection"
+	".popsection\n"
 
 #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
 	OLDINSTR_2(oldinstr, 1, 2)					\
@@ -151,7 +151,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)			\
 	ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)			\
-	".popsection"
+	".popsection\n"
 
 /*
  * Alternative instructions for different CPU types or capabilities.

From de791821c295cc61419a06fe5562288417d1bc58 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 5 Jan 2018 15:27:34 +0100
Subject: [PATCH 287/305] x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN

Use the name associated with the particular attack which needs page table
isolation for mitigation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David Woodhouse <dwmw@amazon.co.uk>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Greg KH <gregkh@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801051525300.1724@nanos
---
 arch/x86/include/asm/cpufeatures.h | 2 +-
 arch/x86/kernel/cpu/common.c       | 2 +-
 arch/x86/mm/pti.c                  | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 07cdd1715705..21ac898df2d8 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -341,6 +341,6 @@
 #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
-#define X86_BUG_CPU_INSECURE		X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
+#define X86_BUG_CPU_MELTDOWN		X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b1be494ab4e8..2d3bd2215e5b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -900,7 +900,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
 
 	if (c->x86_vendor != X86_VENDOR_AMD)
-		setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+		setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
 
 	fpu__init_system(c);
 
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 2da28ba97508..43d4a4a29037 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -56,13 +56,13 @@
 
 static void __init pti_print_if_insecure(const char *reason)
 {
-	if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
 		pr_info("%s\n", reason);
 }
 
 static void __init pti_print_if_secure(const char *reason)
 {
-	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
 		pr_info("%s\n", reason);
 }
 
@@ -96,7 +96,7 @@ void __init pti_check_boottime_disable(void)
 	}
 
 autosel:
-	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
 		return;
 enable:
 	setup_force_cpu_cap(X86_FEATURE_PTI);

From 0cb5b30698fdc8f6b4646012e3acb4ddce430788 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Wed, 3 Jan 2018 14:31:38 -0800
Subject: [PATCH 288/305] kvm: vmx: Scrub hardware GPRs at VM-exit

Guest GPR values are live in the hardware GPRs at VM-exit.  Do not
leave any guest values in hardware GPRs after the guest GPR values are
saved to the vcpu_vmx structure.

This is a partial mitigation for CVE 2017-5715 and CVE 2017-5753.
Specifically, it defeats the Project Zero PoC for CVE 2017-5715.

Suggested-by: Eric Northup <digitaleric@google.com>
Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Eric Northup <digitaleric@google.com>
Reviewed-by: Benjamin Serebrin <serebrin@google.com>
Reviewed-by: Andrew Honig <ahonig@google.com>
[Paolo: Add AMD bits, Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c | 19 +++++++++++++++++++
 arch/x86/kvm/vmx.c | 14 +++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index eb714f1cdf7e..bb31c801f1fc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4985,6 +4985,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 		"mov %%r13, %c[r13](%[svm]) \n\t"
 		"mov %%r14, %c[r14](%[svm]) \n\t"
 		"mov %%r15, %c[r15](%[svm]) \n\t"
+#endif
+		/*
+		* Clear host registers marked as clobbered to prevent
+		* speculative use.
+		*/
+		"xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
+		"xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
+		"xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
+		"xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+		"xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
+#ifdef CONFIG_X86_64
+		"xor %%r8, %%r8 \n\t"
+		"xor %%r9, %%r9 \n\t"
+		"xor %%r10, %%r10 \n\t"
+		"xor %%r11, %%r11 \n\t"
+		"xor %%r12, %%r12 \n\t"
+		"xor %%r13, %%r13 \n\t"
+		"xor %%r14, %%r14 \n\t"
+		"xor %%r15, %%r15 \n\t"
 #endif
 		"pop %%" _ASM_BP
 		:
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8eba631c4dbd..c1e7ed371259 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9415,6 +9415,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		/* Save guest registers, load host registers, keep flags */
 		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
 		"pop %0 \n\t"
+		"setbe %c[fail](%0)\n\t"
 		"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
 		"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
 		__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
@@ -9431,12 +9432,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		"mov %%r13, %c[r13](%0) \n\t"
 		"mov %%r14, %c[r14](%0) \n\t"
 		"mov %%r15, %c[r15](%0) \n\t"
+		"xor %%r8d,  %%r8d \n\t"
+		"xor %%r9d,  %%r9d \n\t"
+		"xor %%r10d, %%r10d \n\t"
+		"xor %%r11d, %%r11d \n\t"
+		"xor %%r12d, %%r12d \n\t"
+		"xor %%r13d, %%r13d \n\t"
+		"xor %%r14d, %%r14d \n\t"
+		"xor %%r15d, %%r15d \n\t"
 #endif
 		"mov %%cr2, %%" _ASM_AX "   \n\t"
 		"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
 
+		"xor %%eax, %%eax \n\t"
+		"xor %%ebx, %%ebx \n\t"
+		"xor %%esi, %%esi \n\t"
+		"xor %%edi, %%edi \n\t"
 		"pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
-		"setbe %c[fail](%0) \n\t"
 		".pushsection .rodata \n\t"
 		".global vmx_return \n\t"
 		"vmx_return: " _ASM_PTR " 2b \n\t"

From d1616f07e8f1a4a490d1791316d4a68906b284aa Mon Sep 17 00:00:00 2001
From: Fugang Duan <fugang.duan@nxp.com>
Date: Thu, 4 Jan 2018 10:47:20 +0800
Subject: [PATCH 289/305] net: fec: free/restore resource in related probe
 error paths

Fixes in probe error path:
- Restore dev_id before failed_ioremap path.
  Fixes: ("net: fec: restore dev_id in the cases of probe error")
- Call of_node_put(phy_node) before failed_phy path.
  Fixes: ("net: fec: Support phys probed from devicetree and fixed-link")

Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 19f198e22e15..a74300a4459c 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3556,11 +3556,11 @@ failed_clk_ipg:
 failed_clk:
 	if (of_phy_is_fixed_link(np))
 		of_phy_deregister_fixed_link(np);
-failed_phy:
 	of_node_put(phy_node);
+failed_phy:
+	dev_id--;
 failed_ioremap:
 	free_netdev(ndev);
-	dev_id--;
 
 	return ret;
 }

From 040ee69226f8a96b7943645d68f41d5d44b5ff7d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Dec 2017 20:20:38 -0500
Subject: [PATCH 290/305] fix "netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED
 mode of 'xt_bpf_info_v1'"

Descriptor table is a shared object; it's not a place where you can
stick temporary references to files, especially when we don't need
an opened file at all.

Cc: stable@vger.kernel.org # v4.14
Fixes: 98589a0998b8 ("netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'")
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/bpf.h    | 10 ++++++++++
 kernel/bpf/inode.c     | 40 +++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c   |  2 +-
 net/netfilter/xt_bpf.c | 14 ++------------
 4 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e55e4255a210..b63a592ad29d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -419,6 +419,8 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 		attr->numa_node : NUMA_NO_NODE;
 }
 
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
+
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -506,6 +508,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 {
 	return 0;
 }
+
+static inline struct bpf_prog *bpf_prog_get_type_path(const char *name,
+				enum bpf_prog_type type)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
@@ -514,6 +522,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 	return bpf_prog_get_type_dev(ufd, type, false);
 }
 
+bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool);
+
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
 
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 01aaef1a77c5..5bb5e49ef4c3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -368,7 +368,45 @@ out:
 	putname(pname);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(bpf_obj_get_user);
+
+static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
+{
+	struct bpf_prog *prog;
+	int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (inode->i_op == &bpf_map_iops)
+		return ERR_PTR(-EINVAL);
+	if (inode->i_op != &bpf_prog_iops)
+		return ERR_PTR(-EACCES);
+
+	prog = inode->i_private;
+
+	ret = security_bpf_prog(prog);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	if (!bpf_prog_get_ok(prog, &type, false))
+		return ERR_PTR(-EINVAL);
+
+	return bpf_prog_inc(prog);
+}
+
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
+{
+	struct bpf_prog *prog;
+	struct path path;
+	int ret = kern_path(name, LOOKUP_FOLLOW, &path);
+	if (ret)
+		return ERR_PTR(ret);
+	prog = __get_prog_inode(d_backing_inode(path.dentry), type);
+	if (!IS_ERR(prog))
+		touch_atime(&path);
+	path_put(&path);
+	return prog;
+}
+EXPORT_SYMBOL(bpf_prog_get_type_path);
 
 static void bpf_evict_inode(struct inode *inode)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c4cfeaa8d5e..5cb783fc8224 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static bool bpf_prog_get_ok(struct bpf_prog *prog,
+bool bpf_prog_get_ok(struct bpf_prog *prog,
 			    enum bpf_prog_type *attach_type, bool attach_drv)
 {
 	/* not an attachment, just a refcount inc, always allow */
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 041da0d9c06f..fa2ca0a13619 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -52,18 +52,8 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
 
 static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
 {
-	mm_segment_t oldfs = get_fs();
-	int retval, fd;
-
-	set_fs(KERNEL_DS);
-	fd = bpf_obj_get_user(path, 0);
-	set_fs(oldfs);
-	if (fd < 0)
-		return fd;
-
-	retval = __bpf_mt_check_fd(fd, ret);
-	sys_close(fd);
-	return retval;
+	*ret = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER);
+	return PTR_ERR_OR_ZERO(*ret);
 }
 
 static int bpf_mt_check(const struct xt_mtchk_param *par)

From 5133550296d43236439494aa955bfb765a89f615 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Thu, 4 Jan 2018 21:06:49 +0300
Subject: [PATCH 291/305] sh_eth: fix SH7757 GEther initialization

Renesas SH7757 has 2 Fast and 2 Gigabit Ether controllers, while the
'sh_eth' driver can only reset and initialize TSU of the first controller
pair. Shimoda-san tried to solve that by adding the 'needs_init' member to
the 'struct sh_eth_plat_data', however the platform code still never sets
this flag. I think that we can infer this information from the 'devno'
variable (set to 'platform_device::id') and reset/init the Ether controller
pair only for an even 'devno'; therefore 'sh_eth_plat_data::needs_init' can
be removed...

Fixes: 150647fb2c31 ("net: sh_eth: change the condition of initialization")
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 4 ++--
 include/linux/sh_eth.h                | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 1bdd67a8a869..f21c1db91c3f 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -3254,8 +3254,8 @@ static int sh_eth_drv_probe(struct platform_device *pdev)
 		ndev->features = NETIF_F_HW_VLAN_CTAG_FILTER;
 	}
 
-	/* initialize first or needed device */
-	if (!devno || pd->needs_init) {
+	/* Need to init only the first port of the two sharing a TSU */
+	if (devno % 2 == 0) {
 		if (mdp->cd->chip_reset)
 			mdp->cd->chip_reset(ndev);
 
diff --git a/include/linux/sh_eth.h b/include/linux/sh_eth.h
index ff3642d267f7..94081e9a5010 100644
--- a/include/linux/sh_eth.h
+++ b/include/linux/sh_eth.h
@@ -17,7 +17,6 @@ struct sh_eth_plat_data {
 	unsigned char mac_addr[ETH_ALEN];
 	unsigned no_ether_link:1;
 	unsigned ether_link_active_low:1;
-	unsigned needs_init:1;
 };
 
 #endif

From 5b9f57cf47b87f07210875d6a24776b4496b818d Mon Sep 17 00:00:00 2001
From: John Johansen <john.johansen@canonical.com>
Date: Thu, 7 Dec 2017 00:28:27 -0800
Subject: [PATCH 292/305] apparmor: fix regression in mount mediation when
 feature set is pinned
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the mount code was refactored for Labels it was not correctly
updated to check whether policy supported mediation of the mount
class.  This causes a regression when the kernel feature set is
reported as supporting mount and policy is pinned to a feature set
that does not support mount mediation.

BugLink: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=882697#41
Fixes: 2ea3ffb7782a ("apparmor: add mount mediation")
Reported-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Cc: Stable <stable@vger.kernel.org>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/mount.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c
index ed9b4d0f9f7e..8c558cbce930 100644
--- a/security/apparmor/mount.c
+++ b/security/apparmor/mount.c
@@ -329,6 +329,9 @@ static int match_mnt_path_str(struct aa_profile *profile,
 	AA_BUG(!mntpath);
 	AA_BUG(!buffer);
 
+	if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT))
+		return 0;
+
 	error = aa_path_name(mntpath, path_flags(profile, mntpath), buffer,
 			     &mntpnt, &info, profile->disconnected);
 	if (error)
@@ -380,6 +383,9 @@ static int match_mnt(struct aa_profile *profile, const struct path *path,
 	AA_BUG(!profile);
 	AA_BUG(devpath && !devbuffer);
 
+	if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT))
+		return 0;
+
 	if (devpath) {
 		error = aa_path_name(devpath, path_flags(profile, devpath),
 				     devbuffer, &devname, &info,
@@ -558,6 +564,9 @@ static int profile_umount(struct aa_profile *profile, struct path *path,
 	AA_BUG(!profile);
 	AA_BUG(!path);
 
+	if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT))
+		return 0;
+
 	error = aa_path_name(path, path_flags(profile, path), buffer, &name,
 			     &info, profile->disconnected);
 	if (error)
@@ -613,7 +622,8 @@ static struct aa_label *build_pivotroot(struct aa_profile *profile,
 	AA_BUG(!new_path);
 	AA_BUG(!old_path);
 
-	if (profile_unconfined(profile))
+	if (profile_unconfined(profile) ||
+	    !PROFILE_MEDIATES(profile, AA_CLASS_MOUNT))
 		return aa_get_newest_label(&profile->label);
 
 	error = aa_path_name(old_path, path_flags(profile, old_path),

From 310d82784fb4d60c80569f5ca9f53a7f3bf1d477 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Fri, 5 Jan 2018 21:55:38 +0100
Subject: [PATCH 293/305] parisc: qemu idle sleep support

Add qemu idle sleep support when running under qemu with SeaBIOS PDC
firmware.

Like the power architecture we use the "or" assembler instructions,
which translate to nops on real hardware, to indicate that qemu shall
idle sleep.

Signed-off-by: Helge Deller <deller@gmx.de>
Cc: Richard Henderson <rth@twiddle.net>
CC: stable@vger.kernel.org # v4.9+
---
 arch/parisc/kernel/process.c | 39 ++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 30f92391a93e..cad3e8661cd6 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -39,6 +39,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/cpu.h>
 #include <linux/module.h>
 #include <linux/personality.h>
 #include <linux/ptrace.h>
@@ -183,6 +184,44 @@ int dump_task_fpu (struct task_struct *tsk, elf_fpregset_t *r)
 	return 1;
 }
 
+/*
+ * Idle thread support
+ *
+ * Detect when running on QEMU with SeaBIOS PDC Firmware and let
+ * QEMU idle the host too.
+ */
+
+int running_on_qemu __read_mostly;
+
+void __cpuidle arch_cpu_idle_dead(void)
+{
+	/* nop on real hardware, qemu will offline CPU. */
+	asm volatile("or %%r31,%%r31,%%r31\n":::);
+}
+
+void __cpuidle arch_cpu_idle(void)
+{
+	local_irq_enable();
+
+	/* nop on real hardware, qemu will idle sleep. */
+	asm volatile("or %%r10,%%r10,%%r10\n":::);
+}
+
+static int __init parisc_idle_init(void)
+{
+	const char *marker;
+
+	/* check QEMU/SeaBIOS marker in PAGE0 */
+	marker = (char *) &PAGE0->pad0;
+	running_on_qemu = (memcmp(marker, "SeaBIOS", 8) == 0);
+
+	if (!running_on_qemu)
+		cpu_idle_poll_ctrl(1);
+
+	return 0;
+}
+arch_initcall(parisc_idle_init);
+
 /*
  * Copy architecture-specific thread state
  */

From fee4380f368e84ed216b62ccd2fbc4126f2bf40b Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Mon, 18 Dec 2017 11:32:45 +0100
Subject: [PATCH 294/305] mtd: nand: pxa3xx: Fix READOOB implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the current driver, OOB bytes are accessed in raw mode, and when a
page access is done with NDCR_SPARE_EN set and NDCR_ECC_EN cleared, the
driver must read the whole spare area (64 bytes in case of a 2k page,
16 bytes for a 512 page). The driver was only reading the free OOB
bytes, which was leaving some unread data in the FIFO and was somehow
leading to a timeout.

We could patch the driver to read ->spare_size + ->ecc_size instead of
just ->spare_size when READOOB is requested, but we'd better make
in-band and OOB accesses consistent.
Since the driver is always accessing in-band data in non-raw mode (with
the ECC engine enabled), we should also access OOB data in this mode.
That's particularly useful when using the BCH engine because in this
mode the free OOB bytes are also ECC protected.

Fixes: 43bcfd2bb24a ("mtd: nand: pxa3xx: Add driver-specific ECC BCH support")
Cc: stable@vger.kernel.org
Reported-by: Sean Nyekjær <sean.nyekjaer@prevas.dk>
Tested-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Acked-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Tested-by: Sean Nyekjaer <sean.nyekjaer@prevas.dk>
Acked-by: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/nand/pxa3xx_nand.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c
index 90b9a9ccbe60..9285f60e5783 100644
--- a/drivers/mtd/nand/pxa3xx_nand.c
+++ b/drivers/mtd/nand/pxa3xx_nand.c
@@ -963,6 +963,7 @@ static void prepare_start_command(struct pxa3xx_nand_info *info, int command)
 
 	switch (command) {
 	case NAND_CMD_READ0:
+	case NAND_CMD_READOOB:
 	case NAND_CMD_PAGEPROG:
 		info->use_ecc = 1;
 		break;

From 7b6af2c53192f1766892ef40c8f48a413509ed72 Mon Sep 17 00:00:00 2001
From: Jacek Anaszewski <jacek.anaszewski@gmail.com>
Date: Wed, 3 Jan 2018 21:13:45 +0100
Subject: [PATCH 295/305] leds: core: Fix regression caused by commit
 2b83ff96f51d

Commit 2b83ff96f51d ("led: core: Fix brightness setting when setting delay_off=0")
replaced del_timer_sync(&led_cdev->blink_timer) with led_stop_software_blink()
in led_blink_set(), which additionally clears LED_BLINK_SW flag as well as
zeroes blink_delay_on and blink_delay_off properties of the struct led_classdev.

Cleansing of the latter ones wasn't required to fix the original issue but
wasn't considered harmful. It nonetheless turned out to be so in case when
pointer to one or both props is passed to led_blink_set() like in the
ledtrig-timer.c. In such cases zeroes are passed later in delay_on and/or
delay_off arguments to led_blink_setup(), which results either in stopping
the software blinking or setting blinking frequency always to 1Hz.

Avoid using led_stop_software_blink() and add a single call required
to clear LED_BLINK_SW flag, which was the only needed modification to
fix the original issue.

Fixes: 2b83ff96f51d ("led: core: Fix brightness setting when setting delay_off=0")
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 drivers/leds/led-core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index f3654fd2eaf3..ede4fa0ac2cc 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -186,8 +186,9 @@ void led_blink_set(struct led_classdev *led_cdev,
 		   unsigned long *delay_on,
 		   unsigned long *delay_off)
 {
-	led_stop_software_blink(led_cdev);
+	del_timer_sync(&led_cdev->blink_timer);
 
+	clear_bit(LED_BLINK_SW, &led_cdev->work_flags);
 	clear_bit(LED_BLINK_ONESHOT, &led_cdev->work_flags);
 	clear_bit(LED_BLINK_ONESHOT_STOP, &led_cdev->work_flags);
 

From b2cd1df66037e7c4697c7e40496bf7e4a5e16a2d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 7 Jan 2018 14:22:41 -0800
Subject: [PATCH 296/305] Linux 4.15-rc7

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index eb1f5973813e..eb59638035dd 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc6
+EXTRAVERSION = -rc7
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*

From 98b8e4e5c17bf87c1b18ed929472051dab39878c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 3 Jan 2018 12:49:29 +0100
Subject: [PATCH 297/305] platform/x86: wmi: Call acpi_wmi_init() later

Calling acpi_wmi_init() at the subsys_initcall() level causes ordering
issues to appear on some systems and they are difficult to reproduce,
because there is no guaranteed ordering between subsys_initcall()
calls, so they may occur in different orders on different systems.

In particular, commit 86d9f48534e8 (mm/slab: fix kmemcg cache
creation delayed issue) exposed one of these issues where genl_init()
and acpi_wmi_init() are both called at the same initcall level, but
the former must run before the latter so as to avoid a NULL pointer
dereference.

For this reason, move the acpi_wmi_init() invocation to the
initcall_sync level which should still be early enough for things
to work correctly in the WMI land.

Link: https://marc.info/?t=151274596700002&r=1&w=2
Reported-by: Jonathan McDowell <noodles@earth.li>
Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Tested-by: Jonathan McDowell <noodles@earth.li>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/x86/wmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index 791449a2370f..daa68acbc900 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -1458,5 +1458,5 @@ static void __exit acpi_wmi_exit(void)
 	class_unregister(&wmi_bus_class);
 }
 
-subsys_initcall(acpi_wmi_init);
+subsys_initcall_sync(acpi_wmi_init);
 module_exit(acpi_wmi_exit);

From 7deea450eb912f269d999de62c8ab922d1461748 Mon Sep 17 00:00:00 2001
From: Sunil Challa <sunilkumar.challa@broadcom.com>
Date: Thu, 4 Jan 2018 18:46:54 -0500
Subject: [PATCH 298/305] bnxt_en: Fix population of flow_type in
 bnxt_hwrm_cfa_flow_alloc()

flow_type in HWRM_FLOW_ALLOC is not being populated correctly due to
incorrect passing of pointer and size of l3_mask argument of is_wildcard().
Fixed this.

Fixes: db1d36a27324 ("bnxt_en: add TC flower offload flow_alloc/free FW cmds")
Signed-off-by: Sunil Challa <sunilkumar.challa@broadcom.com>
Reviewed-by: Sathya Perla <sathya.perla@broadcom.com>
Reviewed-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 3d201d7324bd..d8fee26cd45e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -421,7 +421,7 @@ static int bnxt_hwrm_cfa_flow_alloc(struct bnxt *bp, struct bnxt_tc_flow *flow,
 	}
 
 	/* If all IP and L4 fields are wildcarded then this is an L2 flow */
-	if (is_wildcard(&l3_mask, sizeof(l3_mask)) &&
+	if (is_wildcard(l3_mask, sizeof(*l3_mask)) &&
 	    is_wildcard(&flow->l4_mask, sizeof(flow->l4_mask))) {
 		flow_flags |= CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_L2;
 	} else {

From 78f300049335ae81a5cc6b4b232481dc5e1f9d41 Mon Sep 17 00:00:00 2001
From: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
Date: Thu, 4 Jan 2018 18:46:55 -0500
Subject: [PATCH 299/305] bnxt_en: Fix the 'Invalid VF' id check in
 bnxt_vf_ndo_prep routine.

In bnxt_vf_ndo_prep (which is called by bnxt_get_vf_config ndo), there is a
check for "Invalid VF id". Currently, the check is done against max_vfs.
However, the user doesn't always create max_vfs. So, the check should be
against the created number of VFs. The number of bnxt_vf_info structures
that are allocated in bnxt_alloc_vf_resources routine is the "number of
requested VFs". So, if an "invalid VF id" falls between the requested
number of VFs and the max_vfs, the driver will be dereferencing an invalid
pointer.

Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.")
Signed-off-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index 5ee18660bc33..c9617675f934 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -70,7 +70,7 @@ static int bnxt_vf_ndo_prep(struct bnxt *bp, int vf_id)
 		netdev_err(bp->dev, "vf ndo called though sriov is disabled\n");
 		return -EINVAL;
 	}
-	if (vf_id >= bp->pf.max_vfs) {
+	if (vf_id >= bp->pf.active_vfs) {
 		netdev_err(bp->dev, "Invalid VF id %d\n", vf_id);
 		return -EINVAL;
 	}

From b707fda2df4070785d0fa8a278aa13944c5f51f8 Mon Sep 17 00:00:00 2001
From: Eduardo Otubo <otubo@redhat.com>
Date: Fri, 5 Jan 2018 09:42:16 +0100
Subject: [PATCH 300/305] xen-netfront: enable device after manual module load

When loading the module after unloading it, the network interface would
not be enabled and thus wouldn't have a backend counterpart, leaving it
unusable by the guest.

The guest would face errors like:

  [root@guest ~]# ethtool -i eth0
  Cannot get driver information: No such device

  [root@guest ~]# ifconfig eth0
  eth0: error fetching interface information: Device not found

This patch initializes the state of the netfront device whenever it is
loaded manually. This state tells netback to create its device and
establish the connection between them.

Signed-off-by: Eduardo Otubo <otubo@redhat.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netfront.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index c5a34671abda..9bd7ddeeb6a5 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1326,6 +1326,7 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev)
 
 	netif_carrier_off(netdev);
 
+	xenbus_switch_state(dev, XenbusStateInitialising);
 	return netdev;
 
  exit:

From cc35c3d1edf7a8373a1a5daa80a912dec96a9cd5 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 5 Jan 2018 11:17:17 -0200
Subject: [PATCH 301/305] sctp: do not retransmit upon FragNeeded if PMTU
 discovery is disabled

Currently, if PMTU discovery is disabled on a given transport, but the
configured value is higher than the actual PMTU, it is likely that we
will get some icmp Frag Needed. The issue is, if PMTU discovery is
disabled, we won't update the information and will issue a
retransmission immediately, which may very well trigger another ICMP,
and another retransmission, leading to a loop.

The fix is to simply not trigger immediate retransmissions if PMTU
discovery is disabled on the given transport.

Changes from v2:
- updated stale comment, noticed by Xin Long

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/input.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 621b5ca3fd1c..9320661cc41d 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -399,20 +399,20 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
 		return;
 	}
 
-	if (t->param_flags & SPP_PMTUD_ENABLE) {
-		/* Update transports view of the MTU */
-		sctp_transport_update_pmtu(t, pmtu);
+	if (!(t->param_flags & SPP_PMTUD_ENABLE))
+		/* We can't allow retransmitting in such case, as the
+		 * retransmission would be sized just as before, and thus we
+		 * would get another icmp, and retransmit again.
+		 */
+		return;
 
-		/* Update association pmtu. */
-		sctp_assoc_sync_pmtu(asoc);
-	}
+	/* Update transports view of the MTU */
+	sctp_transport_update_pmtu(t, pmtu);
 
-	/* Retransmit with the new pmtu setting.
-	 * Normally, if PMTU discovery is disabled, an ICMP Fragmentation
-	 * Needed will never be sent, but if a message was sent before
-	 * PMTU discovery was disabled that was larger than the PMTU, it
-	 * would not be fragmented, so it must be re-transmitted fragmented.
-	 */
+	/* Update association pmtu. */
+	sctp_assoc_sync_pmtu(asoc);
+
+	/* Retransmit with the new pmtu setting. */
 	sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
 }
 

From b6c5734db07079c9410147b32407f2366d584e6c Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 5 Jan 2018 11:17:18 -0200
Subject: [PATCH 302/305] sctp: fix the handling of ICMP Frag Needed for too
 small MTUs

syzbot reported a hang involving SCTP, on which it kept flooding dmesg
with the message:
[  246.742374] sctp: sctp_transport_update_pmtu: Reported pmtu 508 too
low, using default minimum of 512

That happened because whenever SCTP hits an ICMP Frag Needed, it tries
to adjust to the new MTU and triggers an immediate retransmission. But
it didn't consider the fact that MTUs smaller than the SCTP minimum MTU
allowed (512) would not cause the PMTU to change, and issued the
retransmission anyway (thus leading to another ICMP Frag Needed, and so
on).

As IPv4 (ip_rt_min_pmtu=556) and IPv6 (IPV6_MIN_MTU=1280) minimum MTU
are higher than that, sctp_transport_update_pmtu() is changed to
re-fetch the PMTU that got set after our request, and with that, detect
if there was an actual change or not.

The fix, thus, skips the immediate retransmission if the received ICMP
resulted in no change, in the hope that SCTP will select another path.

Note: The value being used for the minimum MTU (512,
SCTP_DEFAULT_MINSEGMENT) is not right and instead it should be (576,
SCTP_MIN_PMTU), but such change belongs to another patch.

Changes from v1:
- do not disable PMTU discovery, in the light of commit
06ad391919b2 ("[SCTP] Don't disable PMTU discovery when mtu is small")
and as suggested by Xin Long.
- changed the way to break the rtx loop by detecting if the icmp
  resulted in a change or not
Changes from v2:
none

See-also: https://lkml.org/lkml/2017/12/22/811
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  2 +-
 net/sctp/input.c           |  8 ++++++--
 net/sctp/transport.c       | 29 +++++++++++++++++++----------
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2f8f93da5dc2..9a5ccf03a59b 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -966,7 +966,7 @@ void sctp_transport_burst_limited(struct sctp_transport *);
 void sctp_transport_burst_reset(struct sctp_transport *);
 unsigned long sctp_transport_timeout(struct sctp_transport *);
 void sctp_transport_reset(struct sctp_transport *t);
-void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu);
+bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu);
 void sctp_transport_immediate_rtx(struct sctp_transport *);
 void sctp_transport_dst_release(struct sctp_transport *t);
 void sctp_transport_dst_confirm(struct sctp_transport *t);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 9320661cc41d..141c9c466ec1 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -406,8 +406,12 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
 		 */
 		return;
 
-	/* Update transports view of the MTU */
-	sctp_transport_update_pmtu(t, pmtu);
+	/* Update transports view of the MTU. Return if no update was needed.
+	 * If an update wasn't needed/possible, it also doesn't make sense to
+	 * try to retransmit now.
+	 */
+	if (!sctp_transport_update_pmtu(t, pmtu))
+		return;
 
 	/* Update association pmtu. */
 	sctp_assoc_sync_pmtu(asoc);
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 1e5a22430cf5..47f82bd794d9 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -248,28 +248,37 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 		transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
 }
 
-void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
+bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
 {
 	struct dst_entry *dst = sctp_transport_dst_check(t);
+	bool change = true;
 
 	if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
-		pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
-			__func__, pmtu, SCTP_DEFAULT_MINSEGMENT);
-		/* Use default minimum segment size and disable
-		 * pmtu discovery on this transport.
-		 */
-		t->pathmtu = SCTP_DEFAULT_MINSEGMENT;
-	} else {
-		t->pathmtu = pmtu;
+		pr_warn_ratelimited("%s: Reported pmtu %d too low, using default minimum of %d\n",
+				    __func__, pmtu, SCTP_DEFAULT_MINSEGMENT);
+		/* Use default minimum segment instead */
+		pmtu = SCTP_DEFAULT_MINSEGMENT;
 	}
+	pmtu = SCTP_TRUNC4(pmtu);
 
 	if (dst) {
 		dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu);
 		dst = sctp_transport_dst_check(t);
 	}
 
-	if (!dst)
+	if (!dst) {
 		t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk);
+		dst = t->dst;
+	}
+
+	if (dst) {
+		/* Re-fetch, as under layers may have a higher minimum size */
+		pmtu = SCTP_TRUNC4(dst_mtu(dst));
+		change = t->pathmtu != pmtu;
+	}
+	t->pathmtu = pmtu;
+
+	return change;
 }
 
 /* Caches the dst entry and source address for a transport's destination

From 46cd75036415d94e9cf451e6606a099945d54cc6 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Fri, 5 Jan 2018 11:23:45 -0600
Subject: [PATCH 303/305] phylink: mark expected switch fall-throughs in
 phylink_mii_ioctl

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Addresses-Coverity-ID: 1463447 ("Missing break in switch")
Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 150cd95a6e1e..249ce5cbea22 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1296,6 +1296,7 @@ int phylink_mii_ioctl(struct phylink *pl, struct ifreq *ifr, int cmd)
 		switch (cmd) {
 		case SIOCGMIIPHY:
 			mii->phy_id = pl->phydev->mdio.addr;
+			/* fall through */
 
 		case SIOCGMIIREG:
 			ret = phylink_phy_read(pl, mii->phy_id, mii->reg_num);
@@ -1318,6 +1319,7 @@ int phylink_mii_ioctl(struct phylink *pl, struct ifreq *ifr, int cmd)
 		switch (cmd) {
 		case SIOCGMIIPHY:
 			mii->phy_id = 0;
+			/* fall through */
 
 		case SIOCGMIIREG:
 			ret = phylink_mii_read(pl, mii->phy_id, mii->reg_num);

From 56c0290202ab94a2f2780c449395d4ae8495fab4 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 6 Jan 2018 09:00:09 +0100
Subject: [PATCH 304/305] mdio-sun4i: Fix a memory leak

If the probing of the regulator is deferred, the memory allocated by
'mdiobus_alloc_size()' will be leaked.
It should be freed before the next call to 'sun4i_mdio_probe()' which will
reallocate it.

Fixes: 4bdcb1dd9feb ("net: Add MDIO bus driver for the Allwinner EMAC")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-sun4i.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/mdio-sun4i.c b/drivers/net/phy/mdio-sun4i.c
index 135296508a7e..6425ce04d3f9 100644
--- a/drivers/net/phy/mdio-sun4i.c
+++ b/drivers/net/phy/mdio-sun4i.c
@@ -118,8 +118,10 @@ static int sun4i_mdio_probe(struct platform_device *pdev)
 
 	data->regulator = devm_regulator_get(&pdev->dev, "phy");
 	if (IS_ERR(data->regulator)) {
-		if (PTR_ERR(data->regulator) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
+		if (PTR_ERR(data->regulator) == -EPROBE_DEFER) {
+			ret = -EPROBE_DEFER;
+			goto err_out_free_mdiobus;
+		}
 
 		dev_info(&pdev->dev, "no regulator found\n");
 		data->regulator = NULL;

From 50f3d740d376f664f6accc7e86c9afd8f1c7e1e4 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sun, 7 Jan 2018 00:26:47 +0300
Subject: [PATCH 305/305] sh_eth: fix TXALCR1 offsets

The TXALCR1 offsets are incorrect in the register offset tables, most
probably due to a copy&paste error.  Luckily, the driver never uses this
register. :-)

Fixes: 4a55530f38e4 ("net: sh_eth: modify the definitions of register")
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index f21c1db91c3f..b9e2846589f8 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -147,7 +147,7 @@ static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = {
 	[FWNLCR0]	= 0x0090,
 	[FWALCR0]	= 0x0094,
 	[TXNLCR1]	= 0x00a0,
-	[TXALCR1]	= 0x00a0,
+	[TXALCR1]	= 0x00a4,
 	[RXNLCR1]	= 0x00a8,
 	[RXALCR1]	= 0x00ac,
 	[FWNLCR1]	= 0x00b0,
@@ -399,7 +399,7 @@ static const u16 sh_eth_offset_fast_sh3_sh2[SH_ETH_MAX_REGISTER_OFFSET] = {
 	[FWNLCR0]	= 0x0090,
 	[FWALCR0]	= 0x0094,
 	[TXNLCR1]	= 0x00a0,
-	[TXALCR1]	= 0x00a0,
+	[TXALCR1]	= 0x00a4,
 	[RXNLCR1]	= 0x00a8,
 	[RXALCR1]	= 0x00ac,
 	[FWNLCR1]	= 0x00b0,