Merge tag 's390-5.20-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux

Pull s390 updates from Alexander Gordeev: - Rework copy_oldmem_page() callback to take an iov_iter. This includes a few prerequisite updates and fixes to the oldmem reading code. - Rework cpufeature implementation to allow for various CPU feature indications, which is not only limited to hardware capabilities, but also allows CPU facilities. - Use the cpufeature rework to autoload Ultravisor module when CPU facility 158 is available. - Add ELF note type for encrypted CPU state of a protected virtual CPU. The zgetdump tool from s390-tools package will decrypt the CPU state using a Customer Communication Key and overwrite respective notes to make the data accessible for crash and other debugging tools. - Use vzalloc() instead of vmalloc() + memset() in ChaCha20 crypto test. - Fix incorrect recovery of kretprobe modified return address in stacktrace. - Switch the NMI handler to use generic irqentry_nmi_enter() and irqentry_nmi_exit() helper functions. - Rework the cryptographic Adjunct Processors (AP) pass-through design to support dynamic changes to the AP matrix of a running guest as well as to implement more of the AP architecture. - Minor boot code cleanups. - Grammar and typo fixes to hmcdrv and tape drivers. * tag 's390-5.20-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (46 commits) Revert "s390/smp: enforce lowcore protection on CPU restart" Revert "s390/smp: rework absolute lowcore access" Revert "s390/smp,ptdump: add absolute lowcore markers" s390/unwind: fix fgraph return address recovery s390/nmi: use irqentry_nmi_enter()/irqentry_nmi_exit() s390: add ELF note type for encrypted CPU state of a PV VCPU s390/smp,ptdump: add absolute lowcore markers s390/smp: rework absolute lowcore access s390/setup: rearrange absolute lowcore initialization s390/boot: cleanup adjust_to_uv_max() function s390/smp: enforce lowcore protection on CPU restart s390/tape: fix comment typo s390/hmcdrv: fix Kconfig "its" grammar s390/docs: fix warnings for vfio_ap driver doc s390/docs: fix warnings for vfio_ap driver lock usage doc s390/crash: support multi-segment iterators s390/crash: use static swap buffer for copy_to_user_real() s390/crash: move copy_to_user_real() to crash_dump.c s390/zcore: fix race when reading from hardware system area s390/crash: fix incorrect number of bytes to copy to user space ...
2022-08-06 17:05:21 -07:00
parent d77771c926 953503751a
commit 24cb958695
43 changed files with 1848 additions and 827 deletions
--- a/Documentation/s390/index.rst
+++ b/Documentation/s390/index.rst
@@ -12,6 +12,7 @@ s390 Architecture
    qeth
    s390dbf
    vfio-ap
+    vfio-ap-locking
    vfio-ccw
    zfcpdump
    common_io
--- a/Documentation/s390/vfio-ap-locking.rst
+++ b/Documentation/s390/vfio-ap-locking.rst
@@ -0,0 +1,115 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+VFIO AP Locks Overview
+======================
+This document describes the locks that are pertinent to the secure operation
+of the vfio_ap device driver. Throughout this document, the following variables
+will be used to denote instances of the structures herein described:
+
+.. code-block:: c
+
+  struct ap_matrix_dev *matrix_dev;
+  struct ap_matrix_mdev *matrix_mdev;
+  struct kvm *kvm;
+
+The Matrix Devices Lock (drivers/s390/crypto/vfio_ap_private.h)
+---------------------------------------------------------------
+
+.. code-block:: c
+
+  struct ap_matrix_dev {
+  	...
+  	struct list_head mdev_list;
+  	struct mutex mdevs_lock;
+  	...
+  }
+
+The Matrix Devices Lock (matrix_dev->mdevs_lock) is implemented as a global
+mutex contained within the single object of struct ap_matrix_dev. This lock
+controls access to all fields contained within each matrix_mdev
+(matrix_dev->mdev_list). This lock must be held while reading from, writing to
+or using the data from a field contained within a matrix_mdev instance
+representing one of the vfio_ap device driver's mediated devices.
+
+The KVM Lock (include/linux/kvm_host.h)
+---------------------------------------
+
+.. code-block:: c
+
+  struct kvm {
+  	...
+  	struct mutex lock;
+  	...
+  }
+
+The KVM Lock (kvm->lock) controls access to the state data for a KVM guest. This
+lock must be held by the vfio_ap device driver while one or more AP adapters,
+domains or control domains are being plugged into or unplugged from the guest.
+
+The KVM pointer is stored in the in the matrix_mdev instance
+(matrix_mdev->kvm = kvm) containing the state of the mediated device that has
+been attached to the KVM guest.
+
+The Guests Lock (drivers/s390/crypto/vfio_ap_private.h)
+-----------------------------------------------------------
+
+.. code-block:: c
+
+  struct ap_matrix_dev {
+  	...
+  	struct list_head mdev_list;
+  	struct mutex guests_lock;
+  	...
+  }
+
+The Guests Lock (matrix_dev->guests_lock) controls access to the
+matrix_mdev instances (matrix_dev->mdev_list) that represent mediated devices
+that hold the state for the mediated devices that have been attached to a
+KVM guest. This lock must be held:
+
+1. To control access to the KVM pointer (matrix_mdev->kvm) while the vfio_ap
+   device driver is using it to plug/unplug AP devices passed through to the KVM
+   guest.
+
+2. To add matrix_mdev instances to or remove them from matrix_dev->mdev_list.
+   This is necessary to ensure the proper locking order when the list is perused
+   to find an ap_matrix_mdev instance for the purpose of plugging/unplugging
+   AP devices passed through to a KVM guest.
+
+   For example, when a queue device is removed from the vfio_ap device driver,
+   if the adapter is passed through to a KVM guest, it will have to be
+   unplugged. In order to figure out whether the adapter is passed through,
+   the matrix_mdev object to which the queue is assigned will have to be
+   found. The KVM pointer (matrix_mdev->kvm) can then be used to determine if
+   the mediated device is passed through (matrix_mdev->kvm != NULL) and if so,
+   to unplug the adapter.
+
+It is not necessary to take the Guests Lock to access the KVM pointer if the
+pointer is not used to plug/unplug devices passed through to the KVM guest;
+however, in this case, the Matrix Devices Lock (matrix_dev->mdevs_lock) must be
+held in order to access the KVM pointer since it is set and cleared under the
+protection of the Matrix Devices Lock. A case in point is the function that
+handles interception of the PQAP(AQIC) instruction sub-function. This handler
+needs to access the KVM pointer only for the purposes of setting or clearing IRQ
+resources, so only the matrix_dev->mdevs_lock needs to be held.
+
+The PQAP Hook Lock (arch/s390/include/asm/kvm_host.h)
+-----------------------------------------------------
+
+.. code-block:: c
+
+  typedef int (*crypto_hook)(struct kvm_vcpu *vcpu);
+
+  struct kvm_s390_crypto {
+  	...
+  	struct rw_semaphore pqap_hook_rwsem;
+  	crypto_hook *pqap_hook;
+  	...
+  };
+
+The PQAP Hook Lock is a r/w semaphore that controls access to the function
+pointer of the handler ``(*kvm->arch.crypto.pqap_hook)`` to invoke when the
+PQAP(AQIC) instruction sub-function is intercepted by the host. The lock must be
+held in write mode when pqap_hook value is set, and in read mode when the
+pqap_hook function is called.
--- a/Documentation/s390/vfio-ap.rst
+++ b/Documentation/s390/vfio-ap.rst
@@ -123,27 +123,24 @@ Let's now take a look at how AP instructions executed on a guest are interpreted
 by the hardware.

 A satellite control block called the Crypto Control Block (CRYCB) is attached to
-our main hardware virtualization control block. The CRYCB contains three fields
-to identify the adapters, usage domains and control domains assigned to the KVM
-guest:
+our main hardware virtualization control block. The CRYCB contains an AP Control
+Block (APCB) that has three fields to identify the adapters, usage domains and
+control domains assigned to the KVM guest:

 * The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned
-  to the KVM guest. Each bit in the mask, from left to right (i.e. from most
-  significant to least significant bit in big endian order), corresponds to
+  to the KVM guest. Each bit in the mask, from left to right, corresponds to
  an APID from 0-255. If a bit is set, the corresponding adapter is valid for
  use by the KVM guest.

 * The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains
-  assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from
-  most significant to least significant bit in big endian order), corresponds to
-  an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue
-  is valid for use by the KVM guest.
+  assigned to the KVM guest. Each bit in the mask, from left to right,
+  corresponds to an AP queue index (APQI) from 0-255. If a bit is set, the
+  corresponding queue is valid for use by the KVM guest.

 * The AP Domain Mask field is a bit mask that identifies the AP control domains
  assigned to the KVM guest. The ADM bit mask controls which domains can be
  changed by an AP command-request message sent to a usage domain from the
-  guest. Each bit in the mask, from left to right (i.e. from most significant to
-  least significant bit in big endian order), corresponds to a domain from
+  guest. Each bit in the mask, from left to right, corresponds to a domain from
  0-255. If a bit is set, the corresponding domain can be modified by an AP
  command-request message sent to a usage domain.

@@ -151,10 +148,10 @@ If you recall from the description of an AP Queue, AP instructions include
 an APQN to identify the AP queue to which an AP command-request message is to be
 sent (NQAP and PQAP instructions), or from which a command-reply message is to
 be received (DQAP instruction). The validity of an APQN is defined by the matrix
-calculated from the APM and AQM; it is the cross product of all assigned adapter
-numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1
-and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6),
-(2,5) and (2,6) will be valid for the guest.
+calculated from the APM and AQM; it is the Cartesian product of all assigned
+adapter numbers (APM) with all assigned queue indexes (AQM). For example, if
+adapters 1 and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs
+(1,5), (1,6), (2,5) and (2,6) will be valid for the guest.

 The APQNs can provide secure key functionality - i.e., a private key is stored
 on the adapter card for each of its domains - so each APQN must be assigned to
@@ -192,7 +189,7 @@ The design introduces three new objects:

 1. AP matrix device
 2. VFIO AP device driver (vfio_ap.ko)
-3. VFIO AP mediated matrix pass-through device
+3. VFIO AP mediated pass-through device

 The VFIO AP device driver
 -------------------------
@@ -200,12 +197,13 @@ The VFIO AP (vfio_ap) device driver serves the following purposes:

 1. Provides the interfaces to secure APQNs for exclusive use of KVM guests.

-2. Sets up the VFIO mediated device interfaces to manage a mediated matrix
+2. Sets up the VFIO mediated device interfaces to manage a vfio_ap mediated
   device and creates the sysfs interfaces for assigning adapters, usage
   domains, and control domains comprising the matrix for a KVM guest.

-3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's
-   SIE state description to grant the guest access to a matrix of AP devices
+3. Configures the APM, AQM and ADM in the APCB contained in the CRYCB referenced
+   by a KVM guest's SIE state description to grant the guest access to a matrix
+   of AP devices

 Reserve APQNs for exclusive use of KVM guests
 ---------------------------------------------
@@ -235,10 +233,10 @@ reserved::
  |                  |       8 probe        |                  |
  +--------^---------+                      +--^--^------------+
  6 edit   |                                   |  |
-    apmask |     +-----------------------------+  | 9 mdev create
+    apmask |     +-----------------------------+  | 11 mdev create
    aqmask |     |           1 modprobe           |
  +--------+-----+---+           +----------------+-+         +----------------+
-  |                  |           |                  |8 create |     mediated   |
+  |                  |           |                  |10 create|     mediated   |
  |      admin       |           | VFIO device core |--------->     matrix     |
  |                  +           |                  |         |     device     |
  +------+-+---------+           +--------^---------+         +--------^-------+
@@ -246,14 +244,14 @@ reserved::
 	 | | 9 create vfio_ap-passthrough |                            |
 	 | +------------------------------+                            |
 	 +-------------------------------------------------------------+
-		     10  assign adapter/domain/control domain
+		     12  assign adapter/domain/control domain

 The process for reserving an AP queue for use by a KVM guest is:

 1. The administrator loads the vfio_ap device driver
 2. The vfio-ap driver during its initialization will register a single 'matrix'
   device with the device core. This will serve as the parent device for
-   all mediated matrix devices used to configure an AP matrix for a guest.
+   all vfio_ap mediated devices used to configure an AP matrix for a guest.
 3. The /sys/devices/vfio_ap/matrix device is created by the device core
 4. The vfio_ap device driver will register with the AP bus for AP queue devices
   of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap
@@ -269,24 +267,24 @@ The process for reserving an AP queue for use by a KVM guest is:
   default zcrypt cex4queue driver.
 8. The AP bus probes the vfio_ap device driver to bind the queues reserved for
   it.
-9. The administrator creates a passthrough type mediated matrix device to be
+9. The administrator creates a passthrough type vfio_ap mediated device to be
   used by a guest
 10. The administrator assigns the adapters, usage domains and control domains
    to be exclusively used by a guest.

 Set up the VFIO mediated device interfaces
 ------------------------------------------
-The VFIO AP device driver utilizes the common interface of the VFIO mediated
+The VFIO AP device driver utilizes the common interfaces of the VFIO mediated
 device core driver to:

-* Register an AP mediated bus driver to add a mediated matrix device to and
+* Register an AP mediated bus driver to add a vfio_ap mediated device to and
  remove it from a VFIO group.
-* Create and destroy a mediated matrix device
-* Add a mediated matrix device to and remove it from the AP mediated bus driver
-* Add a mediated matrix device to and remove it from an IOMMU group
+* Create and destroy a vfio_ap mediated device
+* Add a vfio_ap mediated device to and remove it from the AP mediated bus driver
+* Add a vfio_ap mediated device to and remove it from an IOMMU group

 The following high-level block diagram shows the main components and interfaces
-of the VFIO AP mediated matrix device driver::
+of the VFIO AP mediated device driver::

   +-------------+
   |             |
@@ -343,7 +341,7 @@ matrix device.
 	* device_api:
 	    the mediated device type's API
 	* available_instances:
-	    the number of mediated matrix passthrough devices
+	    the number of vfio_ap mediated passthrough devices
 	    that can be created
 	* device_api:
 	    specifies the VFIO API
@@ -351,29 +349,37 @@ matrix device.
    This attribute group identifies the user-defined sysfs attributes of the
    mediated device. When a device is registered with the VFIO mediated device
    framework, the sysfs attribute files identified in the 'mdev_attr_groups'
-    structure will be created in the mediated matrix device's directory. The
-    sysfs attributes for a mediated matrix device are:
+    structure will be created in the vfio_ap mediated device's directory. The
+    sysfs attributes for a vfio_ap mediated device are:

    assign_adapter / unassign_adapter:
      Write-only attributes for assigning/unassigning an AP adapter to/from the
-      mediated matrix device. To assign/unassign an adapter, the APID of the
-      adapter is echoed to the respective attribute file.
+      vfio_ap mediated device. To assign/unassign an adapter, the APID of the
+      adapter is echoed into the respective attribute file.
    assign_domain / unassign_domain:
      Write-only attributes for assigning/unassigning an AP usage domain to/from
-      the mediated matrix device. To assign/unassign a domain, the domain
-      number of the usage domain is echoed to the respective attribute
+      the vfio_ap mediated device. To assign/unassign a domain, the domain
+      number of the usage domain is echoed into the respective attribute
      file.
    matrix:
-      A read-only file for displaying the APQNs derived from the cross product
-      of the adapter and domain numbers assigned to the mediated matrix device.
+      A read-only file for displaying the APQNs derived from the Cartesian
+      product of the adapter and domain numbers assigned to the vfio_ap mediated
+      device.
+    guest_matrix:
+      A read-only file for displaying the APQNs derived from the Cartesian
+      product of the adapter and domain numbers assigned to the APM and AQM
+      fields respectively of the KVM guest's CRYCB. This may differ from the
+      the APQNs assigned to the vfio_ap mediated device if any APQN does not
+      reference a queue device bound to the vfio_ap device driver (i.e., the
+      queue is not in the host's AP configuration).
    assign_control_domain / unassign_control_domain:
      Write-only attributes for assigning/unassigning an AP control domain
-      to/from the mediated matrix device. To assign/unassign a control domain,
-      the ID of the domain to be assigned/unassigned is echoed to the respective
-      attribute file.
+      to/from the vfio_ap mediated device. To assign/unassign a control domain,
+      the ID of the domain to be assigned/unassigned is echoed into the
+      respective attribute file.
    control_domains:
      A read-only file for displaying the control domain numbers assigned to the
-      mediated matrix device.
+      vfio_ap mediated device.

 * functions:

@@ -383,45 +389,75 @@ matrix device.
    * Store the reference to the KVM structure for the guest using the mdev
    * Store the AP matrix configuration for the adapters, domains, and control
      domains assigned via the corresponding sysfs attributes files
+    * Store the AP matrix configuration for the adapters, domains and control
+      domains available to a guest. A guest may not be provided access to APQNs
+      referencing queue devices that do not exist, or are not bound to the
+      vfio_ap device driver.

  remove:
-    deallocates the mediated matrix device's ap_matrix_mdev structure. This will
-    be allowed only if a running guest is not using the mdev.
+    deallocates the vfio_ap mediated device's ap_matrix_mdev structure.
+    This will be allowed only if a running guest is not using the mdev.

 * callback interfaces

-  open:
+  open_device:
    The vfio_ap driver uses this callback to register a
-    VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix
-    device. The open is invoked when QEMU connects the VFIO iommu group
-    for the mdev matrix device to the MDEV bus. Access to the KVM structure used
-    to configure the KVM guest is provided via this callback. The KVM structure,
-    is used to configure the guest's access to the AP matrix defined via the
-    mediated matrix device's sysfs attribute files.
-  release:
-    unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the
-    mdev matrix device and deconfigures the guest's AP matrix.
+    VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the matrix mdev
+    devices. The open_device callback is invoked by userspace to connect the
+    VFIO iommu group for the matrix mdev device to the MDEV bus. Access to the
+    KVM structure used to configure the KVM guest is provided via this callback.
+    The KVM structure, is used to configure the guest's access to the AP matrix
+    defined via the vfio_ap mediated device's sysfs attribute files.

-Configure the APM, AQM and ADM in the CRYCB
-------------------------------------------
-Configuring the AP matrix for a KVM guest will be performed when the
+  close_device:
+    unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the
+    matrix mdev device and deconfigures the guest's AP matrix.
+
+  ioctl:
+    this callback handles the VFIO_DEVICE_GET_INFO and VFIO_DEVICE_RESET ioctls
+    defined by the vfio framework.
+
+Configure the guest's AP resources
+----------------------------------
+Configuring the AP resources for a KVM guest will be performed when the
 VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier
-function is called when QEMU connects to KVM. The guest's AP matrix is
-configured via it's CRYCB by:
+function is called when userspace connects to KVM. The guest's AP resources are
+configured via it's APCB by:

 * Setting the bits in the APM corresponding to the APIDs assigned to the
-  mediated matrix device via its 'assign_adapter' interface.
+  vfio_ap mediated device via its 'assign_adapter' interface.
 * Setting the bits in the AQM corresponding to the domains assigned to the
-  mediated matrix device via its 'assign_domain' interface.
+  vfio_ap mediated device via its 'assign_domain' interface.
 * Setting the bits in the ADM corresponding to the domain dIDs assigned to the
-  mediated matrix device via its 'assign_control_domains' interface.
+  vfio_ap mediated device via its 'assign_control_domains' interface.
+
+The linux device model precludes passing a device through to a KVM guest that
+is not bound to the device driver facilitating its pass-through. Consequently,
+an APQN that does not reference a queue device bound to the vfio_ap device
+driver will not be assigned to a KVM guest's matrix. The AP architecture,
+however, does not provide a means to filter individual APQNs from the guest's
+matrix, so the adapters, domains and control domains assigned to vfio_ap
+mediated device via its sysfs 'assign_adapter', 'assign_domain' and
+'assign_control_domain' interfaces will be filtered before providing the AP
+configuration to a guest:
+
+* The APIDs of the adapters, the APQIs of the domains and the domain numbers of
+  the control domains assigned to the matrix mdev that are not also assigned to
+  the host's AP configuration will be filtered.
+
+* Each APQN derived from the Cartesian product of the APIDs and APQIs assigned
+  to the vfio_ap mdev is examined and if any one of them does not reference a
+  queue device bound to the vfio_ap device driver, the adapter will not be
+  plugged into the guest (i.e., the bit corresponding to its APID will not be
+  set in the APM of the guest's APCB).

 The CPU model features for AP
 -----------------------------
-The AP stack relies on the presence of the AP instructions as well as two
-facilities: The AP Facilities Test (APFT) facility; and the AP Query
-Configuration Information (QCI) facility. These features/facilities are made
-available to a KVM guest via the following CPU model features:
+The AP stack relies on the presence of the AP instructions as well as three
+facilities: The AP Facilities Test (APFT) facility; the AP Query
+Configuration Information (QCI) facility; and the AP Queue Interruption Control
+facility. These features/facilities are made available to a KVM guest via the
+following CPU model features:

 1. ap: Indicates whether the AP instructions are installed on the guest. This
   feature will be enabled by KVM only if the AP instructions are installed
@@ -435,24 +471,28 @@ available to a KVM guest via the following CPU model features:
   can be made available to the guest only if it is available on the host (i.e.,
   facility bit 12 is set).

+4. apqi: Indicates AP Queue Interruption Control faclity is available on the
+   guest. This facility can be made available to the guest only if it is
+   available on the host (i.e., facility bit 65 is set).
+
 Note: If the user chooses to specify a CPU model different than the 'host'
 model to QEMU, the CPU model features and facilities need to be turned on
 explicitly; for example::

-     /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on
+     /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on,apqi=on

 A guest can be precluded from using AP features/facilities by turning them off
 explicitly; for example::

-     /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off
+     /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off,apqi=off

 Note: If the APFT facility is turned off (apft=off) for the guest, the guest
-will not see any AP devices. The zcrypt device drivers that register for type 10
-and newer AP devices - i.e., the cex4card and cex4queue device drivers - need
-the APFT facility to ascertain the facilities installed on a given AP device. If
-the APFT facility is not installed on the guest, then the probe of device
-drivers will fail since only type 10 and newer devices can be configured for
-guest use.
+will not see any AP devices. The zcrypt device drivers on the guest that
+register for type 10 and newer AP devices - i.e., the cex4card and cex4queue
+device drivers - need the APFT facility to ascertain the facilities installed on
+a given AP device. If the APFT facility is not installed on the guest, then no
+adapter or domain devices will get created by the AP bus running on the
+guest because only type 10 and newer devices can be configured for guest use.

 Example
 =======
@@ -471,7 +511,7 @@ CARD.DOMAIN TYPE  MODE
 05.00ab     CEX5C CCA-Coproc
 06          CEX5A Accelerator
 06.0004     CEX5A Accelerator
-06.00ab     CEX5C CCA-Coproc
+06.00ab     CEX5A Accelerator
 =========== ===== ============

 Guest2
@@ -479,9 +519,9 @@ Guest2
 =========== ===== ============
 CARD.DOMAIN TYPE  MODE
 =========== ===== ============
-05          CEX5A Accelerator
-05.0047     CEX5A Accelerator
-05.00ff     CEX5A Accelerator
+05          CEX5C CCA-Coproc
+05.0047     CEX5C CCA-Coproc
+05.00ff     CEX5C CCA-Coproc
 =========== ===== ============

 Guest3
@@ -529,40 +569,56 @@ These are the steps:

 2. Secure the AP queues to be used by the three guests so that the host can not
   access them. To secure them, there are two sysfs files that specify
-   bitmasks marking a subset of the APQN range as 'usable by the default AP
-   queue device drivers' or 'not usable by the default device drivers' and thus
-   available for use by the vfio_ap device driver'. The location of the sysfs
-   files containing the masks are::
+   bitmasks marking a subset of the APQN range as usable only by the default AP
+   queue device drivers. All remaining APQNs are available for use by
+   any other device driver. The vfio_ap device driver is currently the only
+   non-default device driver. The location of the sysfs files containing the
+   masks are::

     /sys/bus/ap/apmask
     /sys/bus/ap/aqmask

   The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs
-   (APID). Each bit in the mask, from left to right (i.e., from most significant
-   to least significant bit in big endian order), corresponds to an APID from
-   0-255. If a bit is set, the APID is marked as usable only by the default AP
-   queue device drivers; otherwise, the APID is usable by the vfio_ap
-   device driver.
+   (APID). Each bit in the mask, from left to right, corresponds to an APID from
+   0-255. If a bit is set, the APID belongs to the subset of APQNs marked as
+   available only to the default AP queue device drivers.

   The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes
-   (APQI). Each bit in the mask, from left to right (i.e., from most significant
-   to least significant bit in big endian order), corresponds to an APQI from
-   0-255. If a bit is set, the APQI is marked as usable only by the default AP
-   queue device drivers; otherwise, the APQI is usable by the vfio_ap device
-   driver.
+   (APQI). Each bit in the mask, from left to right, corresponds to an APQI from
+   0-255. If a bit is set, the APQI belongs to the subset of APQNs marked as
+   available only to the default AP queue device drivers.

-   Take, for example, the following mask::
+   The Cartesian product of the APIDs corresponding to the bits set in the
+   apmask and the APQIs corresponding to the bits set in the aqmask comprise
+   the subset of APQNs that can be used only by the host default device drivers.
+   All other APQNs are available to the non-default device drivers such as the
+   vfio_ap driver.

-      0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+   Take, for example, the following masks::

-    It indicates:
+      apmask:
+      0x7d00000000000000000000000000000000000000000000000000000000000000

-      1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6
-      belong to the vfio_ap device driver's pool.
+      aqmask:
+      0x8000000000000000000000000000000000000000000000000000000000000000
+
+   The masks indicate:
+
+   * Adapters 1, 2, 3, 4, 5, and 7 are available for use by the host default
+     device drivers.
+
+   * Domain 0 is available for use by the host default device drivers
+
+   * The subset of APQNs available for use only by the default host device
+     drivers are:
+
+     (1,0), (2,0), (3,0), (4.0), (5,0) and (7,0)
+
+   * All other APQNs are available for use by the non-default device drivers.

   The APQN of each AP queue device assigned to the linux host is checked by the
-   AP bus against the set of APQNs derived from the cross product of APIDs
-   and APQIs marked as usable only by the default AP queue device drivers. If a
+   AP bus against the set of APQNs derived from the Cartesian product of APIDs
+   and APQIs marked as available to the default AP queue device drivers. If a
   match is detected,  only the default AP queue device drivers will be probed;
   otherwise, the vfio_ap device driver will be probed.

@@ -579,8 +635,7 @@ These are the steps:

 	   0x4100000000000000000000000000000000000000000000000000000000000000

-	Keep in mind that the mask reads from left to right (i.e., most
-	significant to least significant bit in big endian order), so the mask
+	Keep in mind that the mask reads from left to right, so the mask
 	above identifies device numbers 1 and 7 (01000001).

 	If the string is longer than the mask, the operation is terminated with
@@ -626,11 +681,22 @@ These are the steps:
 	    default drivers pool:    adapter 0-15, domain 1
 	    alternate drivers pool:  adapter 16-255, domains 0, 2-255

+   **Note:**
+   Changing a mask such that one or more APQNs will be taken from a vfio_ap
+   mediated device (see below) will fail with an error (EBUSY). A message
+   is logged to the kernel ring buffer which can be viewed with the 'dmesg'
+   command. The output identifies each APQN flagged as 'in use' and identifies
+   the vfio_ap mediated device to which it is assigned; for example:
+
+   Userspace may not re-assign queue 05.0054 already assigned to 62177883-f1bb-47f0-914d-32a22e3a8804
+   Userspace may not re-assign queue 04.0054 already assigned to cef03c3c-903d-4ecc-9a83-40694cb8aee4
+
 Securing the APQNs for our example
 ----------------------------------
   To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047,
   06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding
-   APQNs can either be removed from the default masks::
+   APQNs can be removed from the default masks using either of the following
+   commands::

      echo -5,-6 > /sys/bus/ap/apmask

@@ -683,7 +749,7 @@ Securing the APQNs for our example

     /sys/devices/vfio_ap/matrix/
     --- [mdev_supported_types]
-     ------ [vfio_ap-passthrough] (passthrough mediated matrix device type)
+     ------ [vfio_ap-passthrough] (passthrough vfio_ap mediated device type)
     --------- create
     --------- [devices]

@@ -734,6 +800,9 @@ Securing the APQNs for our example
     ----------------unassign_control_domain
     ----------------unassign_domain

+   Note *****: The vfio_ap mdevs do not persist across reboots unless the
+               mdevctl tool is used to create and persist them.
+
 4. The administrator now needs to configure the matrixes for the mediated
   devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3).

@@ -755,6 +824,10 @@ Securing the APQNs for our example

 	 cat matrix

+   To display the matrix that is or will be assigned to Guest1::
+
+	 cat guest_matrix
+
   This is how the matrix is configured for Guest2::

      echo 5 > assign_adapter
@@ -774,17 +847,24 @@ Securing the APQNs for our example
     higher than the maximum is specified, the operation will terminate with
     an error (ENODEV).

-   * All APQNs that can be derived from the adapter ID and the IDs of
-     the previously assigned domains must be bound to the vfio_ap device
-     driver. If no domains have yet been assigned, then there must be at least
-     one APQN with the specified APID bound to the vfio_ap driver. If no such
-     APQNs are bound to the driver, the operation will terminate with an
-     error (EADDRNOTAVAIL).
+     Note: The maximum adapter number can be obtained via the sysfs
+	   /sys/bus/ap/ap_max_adapter_id attribute file.

-     No APQN that can be derived from the adapter ID and the IDs of the
-     previously assigned domains can be assigned to another mediated matrix
-     device. If an APQN is assigned to another mediated matrix device, the
-     operation will terminate with an error (EADDRINUSE).
+   * Each APQN derived from the Cartesian product of the APID of the adapter
+     being assigned and the APQIs of the domains previously assigned:
+
+     - Must only be available to the vfio_ap device driver as specified in the
+       sysfs /sys/bus/ap/apmask and /sys/bus/ap/aqmask attribute files. If even
+       one APQN is reserved for use by the host device driver, the operation
+       will terminate with an error (EADDRNOTAVAIL).
+
+     - Must NOT be assigned to another vfio_ap mediated device. If even one APQN
+       is assigned to another vfio_ap mediated device, the operation will
+       terminate with an error (EBUSY).
+
+     - Must NOT be assigned while the sysfs /sys/bus/ap/apmask and
+       sys/bus/ap/aqmask attribute files are being edited or the operation may
+       terminate with an error (EBUSY).

   In order to successfully assign a domain:

@@ -793,41 +873,50 @@ Securing the APQNs for our example
     higher than the maximum is specified, the operation will terminate with
     an error (ENODEV).

-   * All APQNs that can be derived from the domain ID and the IDs of
-     the previously assigned adapters must be bound to the vfio_ap device
-     driver. If no domains have yet been assigned, then there must be at least
-     one APQN with the specified APQI bound to the vfio_ap driver. If no such
-     APQNs are bound to the driver, the operation will terminate with an
-     error (EADDRNOTAVAIL).
+     Note: The maximum domain number can be obtained via the sysfs
+	   /sys/bus/ap/ap_max_domain_id attribute file.

-     No APQN that can be derived from the domain ID and the IDs of the
-     previously assigned adapters can be assigned to another mediated matrix
-     device. If an APQN is assigned to another mediated matrix device, the
-     operation will terminate with an error (EADDRINUSE).
+    * Each APQN derived from the Cartesian product of the APQI of the domain
+      being assigned and the APIDs of the adapters previously assigned:

-   In order to successfully assign a control domain, the domain number
-   specified must represent a value from 0 up to the maximum domain number
-   configured for the system. If a control domain number higher than the maximum
-   is specified, the operation will terminate with an error (ENODEV).
+     - Must only be available to the vfio_ap device driver as specified in the
+       sysfs /sys/bus/ap/apmask and /sys/bus/ap/aqmask attribute files. If even
+       one APQN is reserved for use by the host device driver, the operation
+       will terminate with an error (EADDRNOTAVAIL).
+
+     - Must NOT be assigned to another vfio_ap mediated device. If even one APQN
+       is assigned to another vfio_ap mediated device, the operation will
+       terminate with an error (EBUSY).
+
+     - Must NOT be assigned while the sysfs /sys/bus/ap/apmask and
+       sys/bus/ap/aqmask attribute files are being edited or the operation may
+       terminate with an error (EBUSY).
+
+   In order to successfully assign a control domain:
+
+   * The domain number specified must represent a value from 0 up to the maximum
+     domain number configured for the system. If a control domain number higher
+     than the maximum is specified, the operation will terminate with an
+     error (ENODEV).

 5. Start Guest1::

-     /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+     /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on,apqi=on \
 	-device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ...

 7. Start Guest2::

-     /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+     /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on,apqi=on \
 	-device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ...

 7. Start Guest3::

-     /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+     /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on,apqi=on \
 	-device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ...

-When the guest is shut down, the mediated matrix devices may be removed.
+When the guest is shut down, the vfio_ap mediated devices may be removed.

-Using our example again, to remove the mediated matrix device $uuid1::
+Using our example again, to remove the vfio_ap mediated device $uuid1::

   /sys/devices/vfio_ap/matrix/
      --- [mdev_supported_types]
@@ -840,26 +929,143 @@ Using our example again, to remove the mediated matrix device $uuid1::

   echo 1 > remove

-This will remove all of the mdev matrix device's sysfs structures including
-the mdev device itself. To recreate and reconfigure the mdev matrix device,
+This will remove all of the matrix mdev device's sysfs structures including
+the mdev device itself. To recreate and reconfigure the matrix mdev device,
 all of the steps starting with step 3 will have to be performed again. Note
-that the remove will fail if a guest using the mdev is still running.
+that the remove will fail if a guest using the vfio_ap mdev is still running.

-It is not necessary to remove an mdev matrix device, but one may want to
+It is not necessary to remove a vfio_ap mdev, but one may want to
 remove it if no guest will use it during the remaining lifetime of the linux
-host. If the mdev matrix device is removed, one may want to also reconfigure
+host. If the vfio_ap mdev is removed, one may want to also reconfigure
 the pool of adapters and queues reserved for use by the default drivers.

+Hot plug/unplug support:
+========================
+An adapter, domain or control domain may be hot plugged into a running KVM
+guest by assigning it to the vfio_ap mediated device being used by the guest if
+the following conditions are met:
+
+* The adapter, domain or control domain must also be assigned to the host's
+  AP configuration.
+
+* Each APQN derived from the Cartesian product comprised of the APID of the
+  adapter being assigned and the APQIs of the domains assigned must reference a
+  queue device bound to the vfio_ap device driver.
+
+* To hot plug a domain, each APQN derived from the Cartesian product
+  comprised of the APQI of the domain being assigned and the APIDs of the
+  adapters assigned must reference a queue device bound to the vfio_ap device
+  driver.
+
+An adapter, domain or control domain may be hot unplugged from a running KVM
+guest by unassigning it from the vfio_ap mediated device being used by the
+guest.
+
+Over-provisioning of AP queues for a KVM guest:
+===============================================
+Over-provisioning is defined herein as the assignment of adapters or domains to
+a vfio_ap mediated device that do not reference AP devices in the host's AP
+configuration. The idea here is that when the adapter or domain becomes
+available, it will be automatically hot-plugged into the KVM guest using
+the vfio_ap mediated device to which it is assigned as long as each new APQN
+resulting from plugging it in references a queue device bound to the vfio_ap
+device driver.
+
 Limitations
 ===========
-* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN
-  to the default drivers pool of a queue that is still assigned to a mediated
-  device in use by a guest. It is incumbent upon the administrator to
-  ensure there is no mediated device in use by a guest to which the APQN is
-  assigned lest the host be given access to the private data of the AP queue
-  device such as a private key configured specifically for the guest.
+Live guest migration is not supported for guests using AP devices without
+intervention by a system administrator. Before a KVM guest can be migrated,
+the vfio_ap mediated device must be removed. Unfortunately, it can not be
+removed manually (i.e., echo 1 > /sys/devices/vfio_ap/matrix/$UUID/remove) while
+the mdev is in use by a KVM guest. If the guest is being emulated by QEMU,
+its mdev can be hot unplugged from the guest in one of two ways:

-* Dynamically modifying the AP matrix for a running guest (which would amount to
-  hot(un)plug of AP devices for the guest) is currently not supported
+1. If the KVM guest was started with libvirt, you can hot unplug the mdev via
+   the following commands:

-* Live guest migration is not supported for guests using AP devices.
+      virsh detach-device <guestname> <path-to-device-xml>
+
+      For example, to hot unplug mdev 62177883-f1bb-47f0-914d-32a22e3a8804 from
+      the guest named 'my-guest':
+
+         virsh detach-device my-guest ~/config/my-guest-hostdev.xml
+
+            The contents of my-guest-hostdev.xml:
+
+.. code-block:: xml
+
+            <hostdev mode='subsystem' type='mdev' managed='no' model='vfio-ap'>
+              <source>
+                <address uuid='62177883-f1bb-47f0-914d-32a22e3a8804'/>
+              </source>
+            </hostdev>
+
+
+      virsh qemu-monitor-command <guest-name> --hmp "device-del <device-id>"
+
+      For example, to hot unplug the vfio_ap mediated device identified on the
+      qemu command line with 'id=hostdev0' from the guest named 'my-guest':
+
+.. code-block:: sh
+
+         virsh qemu-monitor-command my-guest --hmp "device_del hostdev0"
+
+2. A vfio_ap mediated device can be hot unplugged by attaching the qemu monitor
+   to the guest and using the following qemu monitor command:
+
+      (QEMU) device-del id=<device-id>
+
+      For example, to hot unplug the vfio_ap mediated device that was specified
+      on the qemu command line with 'id=hostdev0' when the guest was started:
+
+         (QEMU) device-del id=hostdev0
+
+After live migration of the KVM guest completes, an AP configuration can be
+restored to the KVM guest by hot plugging a vfio_ap mediated device on the target
+system into the guest in one of two ways:
+
+1. If the KVM guest was started with libvirt, you can hot plug a matrix mediated
+   device into the guest via the following virsh commands:
+
+   virsh attach-device <guestname> <path-to-device-xml>
+
+      For example, to hot plug mdev 62177883-f1bb-47f0-914d-32a22e3a8804 into
+      the guest named 'my-guest':
+
+         virsh attach-device my-guest ~/config/my-guest-hostdev.xml
+
+            The contents of my-guest-hostdev.xml:
+
+.. code-block:: xml
+
+            <hostdev mode='subsystem' type='mdev' managed='no' model='vfio-ap'>
+              <source>
+                <address uuid='62177883-f1bb-47f0-914d-32a22e3a8804'/>
+              </source>
+            </hostdev>
+
+
+   virsh qemu-monitor-command <guest-name> --hmp \
+   "device_add vfio-ap,sysfsdev=<path-to-mdev>,id=<device-id>"
+
+      For example, to hot plug the vfio_ap mediated device
+      62177883-f1bb-47f0-914d-32a22e3a8804 into the guest named 'my-guest' with
+      device-id hostdev0:
+
+      virsh qemu-monitor-command my-guest --hmp \
+      "device_add vfio-ap,\
+      sysfsdev=/sys/devices/vfio_ap/matrix/62177883-f1bb-47f0-914d-32a22e3a8804,\
+      id=hostdev0"
+
+2. A vfio_ap mediated device can be hot plugged by attaching the qemu monitor
+   to the guest and using the following qemu monitor command:
+
+      (qemu) device_add "vfio-ap,sysfsdev=<path-to-mdev>,id=<device-id>"
+
+      For example, to plug the vfio_ap mediated device
+      62177883-f1bb-47f0-914d-32a22e3a8804 into the guest with the device-id
+      hostdev0:
+
+         (QEMU) device-add "vfio-ap,\
+         sysfsdev=/sys/devices/vfio_ap/matrix/62177883-f1bb-47f0-914d-32a22e3a8804,\
+         id=hostdev0"
--- a/2
+++ b/2
@@ -17808,7 +17808,7 @@ M:	Jason Herne <jjherne@linux.ibm.com>
 L:	linux-s390@vger.kernel.org
 S:	Supported
 W:	http://www.ibm.com/developerworks/linux/linux390/
-F:	Documentation/s390/vfio-ap.rst
+F:	Documentation/s390/vfio-ap*
 F:	drivers/s390/crypto/vfio_ap*

 S390 VFIO-CCW DRIVER
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -152,6 +152,7 @@ static void setup_kernel_memory_layout(void)
 	unsigned long vmemmap_start;
 	unsigned long rte_size;
 	unsigned long pages;
+	unsigned long vmax;

 	pages = ident_map_size / PAGE_SIZE;
 	/* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */
@@ -163,10 +164,10 @@ static void setup_kernel_memory_layout(void)
 	    vmalloc_size > _REGION2_SIZE ||
 	    vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN >
 		    _REGION2_SIZE) {
-		MODULES_END = _REGION1_SIZE;
+		vmax = _REGION1_SIZE;
 		rte_size = _REGION2_SIZE;
 	} else {
-		MODULES_END = _REGION2_SIZE;
+		vmax = _REGION2_SIZE;
 		rte_size = _REGION3_SIZE;
 	}
 	/*
@@ -174,11 +175,12 @@ static void setup_kernel_memory_layout(void)
 	 * secure storage limit, so that any vmalloc allocation
 	 * we do could be used to back secure guest storage.
 	 */
-	adjust_to_uv_max(&MODULES_END);
+	vmax = adjust_to_uv_max(vmax);
 #ifdef CONFIG_KASAN
 	/* force vmalloc and modules below kasan shadow */
-	MODULES_END = min(MODULES_END, KASAN_SHADOW_START);
+	vmax = min(vmax, KASAN_SHADOW_START);
 #endif
+	MODULES_END = vmax;
 	MODULES_VADDR = MODULES_END - MODULES_LEN;
 	VMALLOC_END = MODULES_VADDR;

--- a/arch/s390/boot/uv.c
+++ b/arch/s390/boot/uv.c
@@ -57,10 +57,11 @@ void uv_query_info(void)
 }

 #if IS_ENABLED(CONFIG_KVM)
-void adjust_to_uv_max(unsigned long *vmax)
+unsigned long adjust_to_uv_max(unsigned long limit)
 {
 	if (is_prot_virt_host() && uv_info.max_sec_stor_addr)
-		*vmax = min_t(unsigned long, *vmax, uv_info.max_sec_stor_addr);
+		limit = min_t(unsigned long, limit, uv_info.max_sec_stor_addr);
+	return limit;
 }

 static int is_prot_virt_host_capable(void)
--- a/arch/s390/boot/uv.h
+++ b/arch/s390/boot/uv.h
@@ -3,10 +3,13 @@
 #define BOOT_UV_H

 #if IS_ENABLED(CONFIG_KVM)
-void adjust_to_uv_max(unsigned long *vmax);
+unsigned long adjust_to_uv_max(unsigned long limit);
 void sanitize_prot_virt_host(void);
 #else
-static inline void adjust_to_uv_max(unsigned long *vmax) {}
+static inline unsigned long adjust_to_uv_max(unsigned long limit)
+{
+	return limit;
+}
 static inline void sanitize_prot_virt_host(void) {}
 #endif

--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -1049,7 +1049,7 @@ out_err:
 	return ret;
 }

-module_cpu_feature_match(MSA, aes_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, aes_s390_init);
 module_exit(aes_s390_fini);

 MODULE_ALIAS_CRYPTO("aes-all");
--- a/arch/s390/crypto/chacha-glue.c
+++ b/arch/s390/crypto/chacha-glue.c
@@ -121,7 +121,7 @@ static void __exit chacha_mod_fini(void)
 		crypto_unregister_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs));
 }

-module_cpu_feature_match(VXRS, chacha_mod_init);
+module_cpu_feature_match(S390_CPU_FEATURE_VXRS, chacha_mod_init);
 module_exit(chacha_mod_fini);

 MODULE_DESCRIPTION("ChaCha20 stream cipher");
--- a/arch/s390/crypto/crc32-vx.c
+++ b/arch/s390/crypto/crc32-vx.c
@@ -298,7 +298,7 @@ static void __exit crc_vx_mod_exit(void)
 	crypto_unregister_shashes(crc32_vx_algs, ARRAY_SIZE(crc32_vx_algs));
 }

-module_cpu_feature_match(VXRS, crc_vx_mod_init);
+module_cpu_feature_match(S390_CPU_FEATURE_VXRS, crc_vx_mod_init);
 module_exit(crc_vx_mod_exit);

 MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>");
--- a/arch/s390/crypto/des_s390.c
+++ b/arch/s390/crypto/des_s390.c
@@ -492,7 +492,7 @@ out_err:
 	return ret;
 }

-module_cpu_feature_match(MSA, des_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, des_s390_init);
 module_exit(des_s390_exit);

 MODULE_ALIAS_CRYPTO("des");
--- a/arch/s390/crypto/ghash_s390.c
+++ b/arch/s390/crypto/ghash_s390.c
@@ -145,7 +145,7 @@ static void __exit ghash_mod_exit(void)
 	crypto_unregister_shash(&ghash_alg);
 }

-module_cpu_feature_match(MSA, ghash_mod_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, ghash_mod_init);
 module_exit(ghash_mod_exit);

 MODULE_ALIAS_CRYPTO("ghash");
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -907,5 +907,5 @@ static void __exit prng_exit(void)
 	}
 }

-module_cpu_feature_match(MSA, prng_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, prng_init);
 module_exit(prng_exit);
--- a/arch/s390/crypto/sha1_s390.c
+++ b/arch/s390/crypto/sha1_s390.c
@@ -95,7 +95,7 @@ static void __exit sha1_s390_fini(void)
 	crypto_unregister_shash(&alg);
 }

-module_cpu_feature_match(MSA, sha1_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha1_s390_init);
 module_exit(sha1_s390_fini);

 MODULE_ALIAS_CRYPTO("sha1");
--- a/arch/s390/crypto/sha256_s390.c
+++ b/arch/s390/crypto/sha256_s390.c
@@ -134,7 +134,7 @@ static void __exit sha256_s390_fini(void)
 	crypto_unregister_shash(&sha256_alg);
 }

-module_cpu_feature_match(MSA, sha256_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha256_s390_init);
 module_exit(sha256_s390_fini);

 MODULE_ALIAS_CRYPTO("sha256");
--- a/arch/s390/crypto/sha3_256_s390.c
+++ b/arch/s390/crypto/sha3_256_s390.c
@@ -137,7 +137,7 @@ static void __exit sha3_256_s390_fini(void)
 	crypto_unregister_shash(&sha3_256_alg);
 }

-module_cpu_feature_match(MSA, sha3_256_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init);
 module_exit(sha3_256_s390_fini);

 MODULE_ALIAS_CRYPTO("sha3-256");
--- a/arch/s390/crypto/sha3_512_s390.c
+++ b/arch/s390/crypto/sha3_512_s390.c
@@ -147,7 +147,7 @@ static void __exit fini(void)
 	crypto_unregister_shash(&sha3_384_alg);
 }

-module_cpu_feature_match(MSA, init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, init);
 module_exit(fini);

 MODULE_LICENSE("GPL");
--- a/arch/s390/crypto/sha512_s390.c
+++ b/arch/s390/crypto/sha512_s390.c
@@ -142,7 +142,7 @@ static void __exit fini(void)
 	crypto_unregister_shash(&sha384_alg);
 }

-module_cpu_feature_match(MSA, init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, init);
 module_exit(fini);

 MODULE_LICENSE("GPL");
--- a/arch/s390/include/asm/cpufeature.h
+++ b/arch/s390/include/asm/cpufeature.h
@@ -2,28 +2,21 @@
 /*
 * Module interface for CPU features
 *
- * Copyright IBM Corp. 2015
+ * Copyright IBM Corp. 2015, 2022
 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
 */

 #ifndef __ASM_S390_CPUFEATURE_H
 #define __ASM_S390_CPUFEATURE_H

-#include <asm/elf.h>
+enum {
+	S390_CPU_FEATURE_MSA,
+	S390_CPU_FEATURE_VXRS,
+	S390_CPU_FEATURE_UV,
+	MAX_CPU_FEATURES
+};

-/* Hardware features on Linux on z Systems are indicated by facility bits that
- * are mapped to the so-called machine flags.  Particular machine flags are
- * then used to define ELF hardware capabilities; most notably hardware flags
- * that are essential for user space / glibc.
- *
- * Restrict the set of exposed CPU features to ELF hardware capabilities for
- * now.  Additional machine flags can be indicated by values larger than
- * MAX_ELF_HWCAP_FEATURES.
- */
-#define MAX_ELF_HWCAP_FEATURES	(8 * sizeof(elf_hwcap))
-#define MAX_CPU_FEATURES	MAX_ELF_HWCAP_FEATURES
-
-#define cpu_feature(feat)	ilog2(HWCAP_ ## feat)
+#define cpu_feature(feature)	(feature)

 int cpu_have_feature(unsigned int nr);

--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -42,18 +42,4 @@ typedef struct {
 	.context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
 	.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),

-static inline int tprot(unsigned long addr)
-{
-	int rc = -EFAULT;
-
-	asm volatile(
-		"	tprot	0(%1),0\n"
-		"0:	ipm	%0\n"
-		"	srl	%0,28\n"
-		"1:\n"
-		EX_TABLE(0b,1b)
-		: "+d" (rc) : "a" (addr) : "cc");
-	return rc;
-}
-
 #endif
--- a/arch/s390/include/asm/os_info.h
+++ b/arch/s390/include/asm/os_info.h
@@ -8,6 +8,8 @@
 #ifndef _ASM_S390_OS_INFO_H
 #define _ASM_S390_OS_INFO_H

+#include <linux/uio.h>
+
 #define OS_INFO_VERSION_MAJOR	1
 #define OS_INFO_VERSION_MINOR	1
 #define OS_INFO_MAGIC		0x4f53494e464f535aULL /* OSINFOSZ */
@@ -39,7 +41,20 @@ u32 os_info_csum(struct os_info *os_info);

 #ifdef CONFIG_CRASH_DUMP
 void *os_info_old_entry(int nr, unsigned long *size);
-int copy_oldmem_kernel(void *dst, unsigned long src, size_t count);
+size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count);
+
+static inline int copy_oldmem_kernel(void *dst, unsigned long src, size_t count)
+{
+	struct iov_iter iter;
+	struct kvec kvec;
+
+	kvec.iov_base = dst;
+	kvec.iov_len = count;
+	iov_iter_kvec(&iter, WRITE, &kvec, 1, count);
+	if (copy_oldmem_iter(&iter, src, count) < count)
+		return -EFAULT;
+	return 0;
+}
 #else
 static inline void *os_info_old_entry(int nr, unsigned long *size)
 {
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -17,6 +17,7 @@
 #define EXT_SCCB_READ_CPU	(3 * PAGE_SIZE)

 #ifndef __ASSEMBLY__
+#include <linux/uio.h>
 #include <asm/chpid.h>
 #include <asm/cpu.h>

@@ -146,8 +147,7 @@ int sclp_pci_deconfigure(u32 fid);
 int sclp_ap_configure(u32 apid);
 int sclp_ap_deconfigure(u32 apid);
 int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid);
-int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
-int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
+size_t memcpy_hsa_iter(struct iov_iter *iter, unsigned long src, size_t count);
 void sclp_ocf_cpc_name_copy(char *dst);

 static inline int sclp_get_core_info(struct sclp_core_info *info, int early)
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -285,7 +285,6 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo
 	return __clear_user(to, n);
 }

-int copy_to_user_real(void __user *dest, unsigned long src, unsigned long count);
 void *s390_kernel_write(void *dst, const void *src, size_t size);

 int __noreturn __put_kernel_bad(void);
--- a/arch/s390/include/asm/unwind.h
+++ b/arch/s390/include/asm/unwind.h
@@ -47,7 +47,7 @@ struct unwind_state {
 static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state,
 						    unsigned long ip)
 {
-	ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL);
+	ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *)state->sp);
 	if (is_kretprobe_trampoline(ip))
 		ip = kretprobe_find_ret_addr(state->task, (void *)state->sp, &state->kr_cur);
 	return ip;
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -35,7 +35,7 @@ CFLAGS_unwind_bc.o	+= -fno-optimize-sibling-calls

 obj-y	:= traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o
 obj-y	+= processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
-obj-y	+= debug.o irq.o ipl.o dis.o diag.o vdso.o
+obj-y	+= debug.o irq.o ipl.o dis.o diag.o vdso.o cpufeature.o
 obj-y	+= sysinfo.o lgr.o os_info.o machine_kexec.o
 obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
 obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
--- a/arch/s390/kernel/cpufeature.c
+++ b/arch/s390/kernel/cpufeature.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2022
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/bug.h>
+#include <asm/elf.h>
+
+enum {
+	TYPE_HWCAP,
+	TYPE_FACILITY,
+};
+
+struct s390_cpu_feature {
+	unsigned int type	: 4;
+	unsigned int num	: 28;
+};
+
+static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = {
+	[S390_CPU_FEATURE_MSA]	= {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA},
+	[S390_CPU_FEATURE_VXRS]	= {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS},
+	[S390_CPU_FEATURE_UV]	= {.type = TYPE_FACILITY, .num = 158},
+};
+
+/*
+ * cpu_have_feature - Test CPU features on module initialization
+ */
+int cpu_have_feature(unsigned int num)
+{
+	struct s390_cpu_feature *feature;
+
+	if (WARN_ON_ONCE(num >= MAX_CPU_FEATURES))
+		return 0;
+	feature = &s390_cpu_features[num];
+	switch (feature->type) {
+	case TYPE_HWCAP:
+		return !!(elf_hwcap & BIT(feature->num));
+	case TYPE_FACILITY:
+		return test_facility(feature->num);
+	default:
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+}
+EXPORT_SYMBOL(cpu_have_feature);
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -53,6 +53,8 @@ struct save_area {
 };

 static LIST_HEAD(dump_save_areas);
+static DEFINE_MUTEX(memcpy_real_mutex);
+static char memcpy_real_buf[PAGE_SIZE];

 /*
 * Allocate a save area
@@ -63,7 +65,7 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu)

 	sa = memblock_alloc(sizeof(*sa), 8);
 	if (!sa)
-		panic("Failed to allocate save area\n");
+		return NULL;

 	if (is_boot_cpu)
 		list_add(&sa->list, &dump_save_areas);
@@ -114,38 +116,35 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs)
 	memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128));
 }

-/*
- * Return physical address for virtual address
- */
-static inline void *load_real_addr(void *addr)
+static size_t copy_to_iter_real(struct iov_iter *iter, unsigned long src, size_t count)
 {
-	unsigned long real_addr;
+	size_t len, copied, res = 0;

-	asm volatile(
-		   "	lra     %0,0(%1)\n"
-		   "	jz	0f\n"
-		   "	la	%0,0\n"
-		   "0:"
-		   : "=a" (real_addr) : "a" (addr) : "cc");
-	return (void *)real_addr;
+	mutex_lock(&memcpy_real_mutex);
+	while (count) {
+		len = min(PAGE_SIZE, count);
+		if (memcpy_real(memcpy_real_buf, src, len))
+			break;
+		copied = copy_to_iter(memcpy_real_buf, len, iter);
+		count -= copied;
+		src += copied;
+		res += copied;
+		if (copied < len)
+			break;
+	}
+	mutex_unlock(&memcpy_real_mutex);
+	return res;
 }

-/*
- * Copy memory of the old, dumped system to a kernel space virtual address
- */
-int copy_oldmem_kernel(void *dst, unsigned long src, size_t count)
+size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count)
 {
-	unsigned long len;
-	void *ra;
-	int rc;
+	size_t len, copied, res = 0;

 	while (count) {
 		if (!oldmem_data.start && src < sclp.hsa_size) {
 			/* Copy from zfcp/nvme dump HSA area */
 			len = min(count, sclp.hsa_size - src);
-			rc = memcpy_hsa_kernel(dst, src, len);
-			if (rc)
-				return rc;
+			copied = memcpy_hsa_iter(iter, src, len);
 		} else {
 			/* Check for swapped kdump oldmem areas */
 			if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) {
@@ -157,57 +156,15 @@ int copy_oldmem_kernel(void *dst, unsigned long src, size_t count)
 			} else {
 				len = count;
 			}
-			if (is_vmalloc_or_module_addr(dst)) {
-				ra = load_real_addr(dst);
-				len = min(PAGE_SIZE - offset_in_page(ra), len);
-			} else {
-				ra = dst;
-			}
-			if (memcpy_real(ra, src, len))
-				return -EFAULT;
+			copied = copy_to_iter_real(iter, src, len);
 		}
-		dst += len;
-		src += len;
-		count -= len;
+		count -= copied;
+		src += copied;
+		res += copied;
+		if (copied < len)
+			break;
 	}
-	return 0;
-}
-
-/*
- * Copy memory of the old, dumped system to a user space virtual address
- */
-static int copy_oldmem_user(void __user *dst, unsigned long src, size_t count)
-{
-	unsigned long len;
-	int rc;
-
-	while (count) {
-		if (!oldmem_data.start && src < sclp.hsa_size) {
-			/* Copy from zfcp/nvme dump HSA area */
-			len = min(count, sclp.hsa_size - src);
-			rc = memcpy_hsa_user(dst, src, len);
-			if (rc)
-				return rc;
-		} else {
-			/* Check for swapped kdump oldmem areas */
-			if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) {
-				src -= oldmem_data.start;
-				len = min(count, oldmem_data.size - src);
-			} else if (oldmem_data.start && src < oldmem_data.size) {
-				len = min(count, oldmem_data.size - src);
-				src += oldmem_data.start;
-			} else {
-				len = count;
-			}
-			rc = copy_to_user_real(dst, src, count);
-			if (rc)
-				return rc;
-		}
-		dst += len;
-		src += len;
-		count -= len;
-	}
-	return 0;
+	return res;
 }

 /*
@@ -217,26 +174,9 @@ ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize,
 			 unsigned long offset)
 {
 	unsigned long src;
-	int rc;

-	if (!(iter_is_iovec(iter) || iov_iter_is_kvec(iter)))
-		return -EINVAL;
-	/* Multi-segment iterators are not supported */
-	if (iter->nr_segs > 1)
-		return -EINVAL;
-	if (!csize)
-		return 0;
 	src = pfn_to_phys(pfn) + offset;
-
-	/* XXX: pass the iov_iter down to a common function */
-	if (iter_is_iovec(iter))
-		rc = copy_oldmem_user(iter->iov->iov_base, src, csize);
-	else
-		rc = copy_oldmem_kernel(iter->kvec->iov_base, src, csize);
-	if (rc < 0)
-		return rc;
-	iov_iter_advance(iter, csize);
-	return csize;
+	return copy_oldmem_iter(iter, src, csize);
 }

 /*
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -11,6 +11,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/init.h>
 #include <linux/errno.h>
+#include <linux/entry-common.h>
 #include <linux/hardirq.h>
 #include <linux/log2.h>
 #include <linux/kprobes.h>
@@ -397,11 +398,12 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
 	static unsigned long long last_ipd;
 	struct mcck_struct *mcck;
 	unsigned long long tmp;
+	irqentry_state_t irq_state;
 	union mci mci;
 	unsigned long mcck_dam_code;
 	int mcck_pending = 0;

-	nmi_enter();
+	irq_state = irqentry_nmi_enter(regs);

 	if (user_mode(regs))
 		update_timer_mcck();
@@ -504,14 +506,14 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
 	clear_cpu_flag(CIF_MCCK_GUEST);

 	if (user_mode(regs) && mcck_pending) {
-		nmi_exit();
+		irqentry_nmi_exit(regs, irq_state);
 		return 1;
 	}

 	if (mcck_pending)
 		schedule_mcck_handler();

-	nmi_exit();
+	irqentry_nmi_exit(regs, irq_state);
 	return 0;
 }
 NOKPROBE_SYMBOL(s390_do_machine_check);
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -8,7 +8,6 @@
 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

 #include <linux/stop_machine.h>
-#include <linux/cpufeature.h>
 #include <linux/bitops.h>
 #include <linux/kernel.h>
 #include <linux/random.h>
@@ -96,15 +95,6 @@ void cpu_init(void)
 	enter_lazy_tlb(&init_mm, current);
 }

-/*
- * cpu_have_feature - Test CPU features on module initialization
- */
-int cpu_have_feature(unsigned int num)
-{
-	return elf_hwcap & (1UL << num);
-}
-EXPORT_SYMBOL(cpu_have_feature);
-
 static void show_facilities(struct seq_file *m)
 {
 	unsigned int bit;
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -474,19 +474,18 @@ static void __init setup_lowcore_dat_off(void)
 	lc->restart_data = 0;
 	lc->restart_source = -1U;

-	mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE);
-	if (!mcck_stack)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, THREAD_SIZE, THREAD_SIZE);
-	lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET;
-
-	/* Setup absolute zero lowcore */
 	put_abs_lowcore(restart_stack, lc->restart_stack);
 	put_abs_lowcore(restart_fn, lc->restart_fn);
 	put_abs_lowcore(restart_data, lc->restart_data);
 	put_abs_lowcore(restart_source, lc->restart_source);
 	put_abs_lowcore(restart_psw, lc->restart_psw);

+	mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+	if (!mcck_stack)
+		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+		      __func__, THREAD_SIZE, THREAD_SIZE);
+	lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET;
+
 	lc->spinlock_lockval = arch_spin_lockval(0);
 	lc->spinlock_index = 0;
 	arch_spin_lock_setup(0);
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -171,32 +171,6 @@ void memcpy_absolute(void *dest, void *src, size_t count)
 	arch_local_irq_restore(flags);
 }

-/*
- * Copy memory from kernel (real) to user (virtual)
- */
-int copy_to_user_real(void __user *dest, unsigned long src, unsigned long count)
-{
-	int offs = 0, size, rc;
-	char *buf;
-
-	buf = (char *) __get_free_page(GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-	rc = -EFAULT;
-	while (offs < count) {
-		size = min(PAGE_SIZE, count - offs);
-		if (memcpy_real(buf, src + offs, size))
-			goto out;
-		if (copy_to_user(dest + offs, buf, size))
-			goto out;
-		offs += size;
-	}
-	rc = 0;
-out:
-	free_page((unsigned long) buf);
-	return rc;
-}
-
 /*
 * Check if physical address is within prefix or zero page
 */
--- a/drivers/char/hw_random/s390-trng.c
+++ b/drivers/char/hw_random/s390-trng.c
@@ -252,5 +252,5 @@ static void __exit trng_exit(void)
 	trng_debug_exit();
 }

-module_cpu_feature_match(MSA, trng_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, trng_init);
 module_exit(trng_exit);
--- a/drivers/s390/char/Kconfig
+++ b/drivers/s390/char/Kconfig
@@ -89,7 +89,7 @@ config HMC_DRV
 	  Management Console (HMC) drive CD/DVD-ROM. It is available as a
 	  module, called 'hmcdrv', and also as kernel built-in. There is one
 	  optional parameter for this module: cachesize=N, which modifies the
-	  transfer cache size from it's default value 0.5MB to N bytes. If N
+	  transfer cache size from its default value 0.5MB to N bytes. If N
 	  is zero, then no caching is performed.

 config SCLP_OFB
--- a/drivers/s390/char/tape_34xx.c
+++ b/drivers/s390/char/tape_34xx.c
@@ -548,7 +548,7 @@ tape_34xx_unit_check(struct tape_device *device, struct tape_request *request,
 	case 0x2e:
 		/*
 		 * Not capable. This indicates either that the drive fails
-		 * reading the format id mark or that that format specified
+		 * reading the format id mark or that format specified
 		 * is not supported by the drive.
 		 */
 		dev_warn (&device->cdev->dev, "The tape unit cannot process "
--- a/drivers/s390/char/uvdevice.c
+++ b/drivers/s390/char/uvdevice.c
@@ -27,6 +27,7 @@
 #include <linux/stddef.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
+#include <linux/cpufeature.h>

 #include <asm/uvdevice.h>
 #include <asm/uv.h>
@@ -244,12 +245,10 @@ static void __exit uvio_dev_exit(void)

 static int __init uvio_dev_init(void)
 {
-	if (!test_facility(158))
-		return -ENXIO;
 	return misc_register(&uvio_dev_miscdev);
 }

-module_init(uvio_dev_init);
+module_cpu_feature_match(S390_CPU_FEATURE_UV, uvio_dev_init);
 module_exit(uvio_dev_exit);

 MODULE_AUTHOR("IBM Corporation");
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -17,6 +17,7 @@
 #include <linux/debugfs.h>
 #include <linux/panic_notifier.h>
 #include <linux/reboot.h>
+#include <linux/uio.h>

 #include <asm/asm-offsets.h>
 #include <asm/ipl.h>
@@ -50,36 +51,41 @@ static struct dentry *zcore_reipl_file;
 static struct dentry *zcore_hsa_file;
 static struct ipl_parameter_block *zcore_ipl_block;

+static DEFINE_MUTEX(hsa_buf_mutex);
 static char hsa_buf[PAGE_SIZE] __aligned(PAGE_SIZE);

 /*
- * Copy memory from HSA to user memory (not reentrant):
+ * Copy memory from HSA to iterator (not reentrant):
 *
- * @dest:  User buffer where memory should be copied to
+ * @iter:  Iterator where memory should be copied to
 * @src:   Start address within HSA where data should be copied
 * @count: Size of buffer, which should be copied
 */
-int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count)
+size_t memcpy_hsa_iter(struct iov_iter *iter, unsigned long src, size_t count)
 {
-	unsigned long offset, bytes;
+	size_t bytes, copied, res = 0;
+	unsigned long offset;

 	if (!hsa_available)
-		return -ENODATA;
+		return 0;

+	mutex_lock(&hsa_buf_mutex);
 	while (count) {
 		if (sclp_sdias_copy(hsa_buf, src / PAGE_SIZE + 2, 1)) {
 			TRACE("sclp_sdias_copy() failed\n");
-			return -EIO;
+			break;
 		}
 		offset = src % PAGE_SIZE;
 		bytes = min(PAGE_SIZE - offset, count);
-		if (copy_to_user(dest, hsa_buf + offset, bytes))
-			return -EFAULT;
-		src += bytes;
-		dest += bytes;
-		count -= bytes;
+		copied = copy_to_iter(hsa_buf + offset, bytes, iter);
+		count -= copied;
+		src += copied;
+		res += copied;
+		if (copied < bytes)
+			break;
 	}
-	return 0;
+	mutex_unlock(&hsa_buf_mutex);
+	return res;
 }

 /*
@@ -89,25 +95,16 @@ int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count)
 * @src:   Start address within HSA where data should be copied
 * @count: Size of buffer, which should be copied
 */
-int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count)
+static inline int memcpy_hsa_kernel(void *dst, unsigned long src, size_t count)
 {
-	unsigned long offset, bytes;
+	struct iov_iter iter;
+	struct kvec kvec;

-	if (!hsa_available)
-		return -ENODATA;
-
-	while (count) {
-		if (sclp_sdias_copy(hsa_buf, src / PAGE_SIZE + 2, 1)) {
-			TRACE("sclp_sdias_copy() failed\n");
-			return -EIO;
-		}
-		offset = src % PAGE_SIZE;
-		bytes = min(PAGE_SIZE - offset, count);
-		memcpy(dest, hsa_buf + offset, bytes);
-		src += bytes;
-		dest += bytes;
-		count -= bytes;
-	}
+	kvec.iov_base = dst;
+	kvec.iov_len = count;
+	iov_iter_kvec(&iter, WRITE, &kvec, 1, count);
+	if (memcpy_hsa_iter(&iter, src, count) < count)
+		return -EIO;
 	return 0;
 }

--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -838,6 +838,17 @@ static void ap_bus_revise_bindings(void)
 	bus_for_each_dev(&ap_bus_type, NULL, NULL, __ap_revise_reserved);
 }

+/**
+ * ap_owned_by_def_drv: indicates whether an AP adapter is reserved for the
+ *			default host driver or not.
+ * @card: the APID of the adapter card to check
+ * @queue: the APQI of the queue to check
+ *
+ * Note: the ap_perms_mutex must be locked by the caller of this function.
+ *
+ * Return: an int specifying whether the AP adapter is reserved for the host (1)
+ *	   or not (0).
+ */
 int ap_owned_by_def_drv(int card, int queue)
 {
 	int rc = 0;
@@ -845,25 +856,31 @@ int ap_owned_by_def_drv(int card, int queue)
 	if (card < 0 || card >= AP_DEVICES || queue < 0 || queue >= AP_DOMAINS)
 		return -EINVAL;

-	mutex_lock(&ap_perms_mutex);
-
 	if (test_bit_inv(card, ap_perms.apm) &&
 	    test_bit_inv(queue, ap_perms.aqm))
 		rc = 1;

-	mutex_unlock(&ap_perms_mutex);
-
 	return rc;
 }
 EXPORT_SYMBOL(ap_owned_by_def_drv);

+/**
+ * ap_apqn_in_matrix_owned_by_def_drv: indicates whether every APQN contained in
+ *				       a set is reserved for the host drivers
+ *				       or not.
+ * @apm: a bitmap specifying a set of APIDs comprising the APQNs to check
+ * @aqm: a bitmap specifying a set of APQIs comprising the APQNs to check
+ *
+ * Note: the ap_perms_mutex must be locked by the caller of this function.
+ *
+ * Return: an int specifying whether each APQN is reserved for the host (1) or
+ *	   not (0)
+ */
 int ap_apqn_in_matrix_owned_by_def_drv(unsigned long *apm,
 				       unsigned long *aqm)
 {
 	int card, queue, rc = 0;

-	mutex_lock(&ap_perms_mutex);
-
 	for (card = 0; !rc && card < AP_DEVICES; card++)
 		if (test_bit_inv(card, apm) &&
 		    test_bit_inv(card, ap_perms.apm))
@@ -872,8 +889,6 @@ int ap_apqn_in_matrix_owned_by_def_drv(unsigned long *apm,
 				    test_bit_inv(queue, ap_perms.aqm))
 					rc = 1;

-	mutex_unlock(&ap_perms_mutex);
-
 	return rc;
 }
 EXPORT_SYMBOL(ap_apqn_in_matrix_owned_by_def_drv);
--- a/drivers/s390/crypto/pkey_api.c
+++ b/drivers/s390/crypto/pkey_api.c
@@ -2115,5 +2115,5 @@ static void __exit pkey_exit(void)
 	pkey_debug_exit();
 }

-module_cpu_feature_match(MSA, pkey_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, pkey_init);
 module_exit(pkey_exit);
--- a/drivers/s390/crypto/vfio_ap_drv.c
+++ b/drivers/s390/crypto/vfio_ap_drv.c
@@ -18,9 +18,6 @@

 #define VFIO_AP_ROOT_NAME "vfio_ap"
 #define VFIO_AP_DEV_NAME "matrix"
-#define AP_QUEUE_ASSIGNED "assigned"
-#define AP_QUEUE_UNASSIGNED "unassigned"
-#define AP_QUEUE_IN_USE "in use"

 MODULE_AUTHOR("IBM Corporation");
 MODULE_DESCRIPTION("VFIO AP device driver, Copyright IBM Corp. 2018");
@@ -46,120 +43,12 @@ static struct ap_device_id ap_queue_ids[] = {
 	{ /* end of sibling */ },
 };

-static struct ap_matrix_mdev *vfio_ap_mdev_for_queue(struct vfio_ap_queue *q)
-{
-	struct ap_matrix_mdev *matrix_mdev;
-	unsigned long apid = AP_QID_CARD(q->apqn);
-	unsigned long apqi = AP_QID_QUEUE(q->apqn);
-
-	list_for_each_entry(matrix_mdev, &matrix_dev->mdev_list, node) {
-		if (test_bit_inv(apid, matrix_mdev->matrix.apm) &&
-		    test_bit_inv(apqi, matrix_mdev->matrix.aqm))
-			return matrix_mdev;
-	}
-
-	return NULL;
-}
-
-static ssize_t status_show(struct device *dev,
-			   struct device_attribute *attr,
-			   char *buf)
-{
-	ssize_t nchars = 0;
-	struct vfio_ap_queue *q;
-	struct ap_matrix_mdev *matrix_mdev;
-	struct ap_device *apdev = to_ap_dev(dev);
-
-	mutex_lock(&matrix_dev->lock);
-	q = dev_get_drvdata(&apdev->device);
-	matrix_mdev = vfio_ap_mdev_for_queue(q);
-
-	if (matrix_mdev) {
-		if (matrix_mdev->kvm)
-			nchars = scnprintf(buf, PAGE_SIZE, "%s\n",
-					   AP_QUEUE_IN_USE);
-		else
-			nchars = scnprintf(buf, PAGE_SIZE, "%s\n",
-					   AP_QUEUE_ASSIGNED);
-	} else {
-		nchars = scnprintf(buf, PAGE_SIZE, "%s\n",
-				   AP_QUEUE_UNASSIGNED);
-	}
-
-	mutex_unlock(&matrix_dev->lock);
-
-	return nchars;
-}
-
-static DEVICE_ATTR_RO(status);
-
-static struct attribute *vfio_queue_attrs[] = {
-	&dev_attr_status.attr,
-	NULL,
-};
-
-static const struct attribute_group vfio_queue_attr_group = {
-	.attrs = vfio_queue_attrs,
-};
-
-/**
- * vfio_ap_queue_dev_probe: Allocate a vfio_ap_queue structure and associate it
- *			    with the device as driver_data.
- *
- * @apdev: the AP device being probed
- *
- * Return: returns 0 if the probe succeeded; otherwise, returns an error if
- *	   storage could not be allocated for a vfio_ap_queue object or the
- *	   sysfs 'status' attribute could not be created for the queue device.
- */
-static int vfio_ap_queue_dev_probe(struct ap_device *apdev)
-{
-	int ret;
-	struct vfio_ap_queue *q;
-
-	q = kzalloc(sizeof(*q), GFP_KERNEL);
-	if (!q)
-		return -ENOMEM;
-
-	mutex_lock(&matrix_dev->lock);
-	dev_set_drvdata(&apdev->device, q);
-	q->apqn = to_ap_queue(&apdev->device)->qid;
-	q->saved_isc = VFIO_AP_ISC_INVALID;
-
-	ret = sysfs_create_group(&apdev->device.kobj, &vfio_queue_attr_group);
-	if (ret) {
-		dev_set_drvdata(&apdev->device, NULL);
-		kfree(q);
-	}
-
-	mutex_unlock(&matrix_dev->lock);
-
-	return ret;
-}
-
-/**
- * vfio_ap_queue_dev_remove: Free the associated vfio_ap_queue structure.
- *
- * @apdev: the AP device being removed
- *
- * Takes the matrix lock to avoid actions on this device while doing the remove.
- */
-static void vfio_ap_queue_dev_remove(struct ap_device *apdev)
-{
-	struct vfio_ap_queue *q;
-
-	mutex_lock(&matrix_dev->lock);
-	sysfs_remove_group(&apdev->device.kobj, &vfio_queue_attr_group);
-	q = dev_get_drvdata(&apdev->device);
-	vfio_ap_mdev_reset_queue(q, 1);
-	dev_set_drvdata(&apdev->device, NULL);
-	kfree(q);
-	mutex_unlock(&matrix_dev->lock);
-}
-
 static struct ap_driver vfio_ap_drv = {
-	.probe = vfio_ap_queue_dev_probe,
-	.remove = vfio_ap_queue_dev_remove,
+	.probe = vfio_ap_mdev_probe_queue,
+	.remove = vfio_ap_mdev_remove_queue,
+	.in_use = vfio_ap_mdev_resource_in_use,
+	.on_config_changed = vfio_ap_on_cfg_changed,
+	.on_scan_complete = vfio_ap_on_scan_complete,
 	.ids = ap_queue_ids,
 };

@@ -212,8 +101,9 @@ static int vfio_ap_matrix_dev_create(void)
 			goto matrix_alloc_err;
 	}

-	mutex_init(&matrix_dev->lock);
+	mutex_init(&matrix_dev->mdevs_lock);
 	INIT_LIST_HEAD(&matrix_dev->mdev_list);
+	mutex_init(&matrix_dev->guests_lock);

 	dev_set_name(&matrix_dev->device, "%s", VFIO_AP_DEV_NAME);
 	matrix_dev->device.parent = root_device;
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
--- a/drivers/s390/crypto/vfio_ap_private.h
+++ b/drivers/s390/crypto/vfio_ap_private.h
@@ -19,6 +19,7 @@
 #include <linux/mutex.h>
 #include <linux/kvm_host.h>
 #include <linux/vfio.h>
+#include <linux/hashtable.h>

 #include "ap_bus.h"

@@ -32,20 +33,26 @@
 * @available_instances: number of mediated matrix devices that can be created
 * @info:	the struct containing the output from the PQAP(QCI) instruction
 * @mdev_list:	the list of mediated matrix devices created
- * @lock:	mutex for locking the AP matrix device. This lock will be
+ * @mdevs_lock: mutex for locking the AP matrix device. This lock will be
 *		taken every time we fiddle with state managed by the vfio_ap
 *		driver, be it using @mdev_list or writing the state of a
 *		single ap_matrix_mdev device. It's quite coarse but we don't
 *		expect much contention.
 * @vfio_ap_drv: the vfio_ap device driver
+ * @guests_lock: mutex for controlling access to a guest that is using AP
+ *		 devices passed through by the vfio_ap device driver. This lock
+ *		 will be taken when the AP devices are plugged into or unplugged
+ *		 from a guest, and when an ap_matrix_mdev device is added to or
+ *		 removed from @mdev_list or the list is iterated.
 */
 struct ap_matrix_dev {
 	struct device device;
 	atomic_t available_instances;
 	struct ap_config_info info;
 	struct list_head mdev_list;
-	struct mutex lock;
+	struct mutex mdevs_lock; /* serializes access to each ap_matrix_mdev */
 	struct ap_driver  *vfio_ap_drv;
+	struct mutex guests_lock; /* serializes access to each KVM guest */
 };

 extern struct ap_matrix_dev *matrix_dev;
@@ -74,6 +81,15 @@ struct ap_matrix {
 	DECLARE_BITMAP(adm, 256);
 };

+/**
+ * struct ap_queue_table - a table of queue objects.
+ *
+ * @queues: a hashtable of queues (struct vfio_ap_queue).
+ */
+struct ap_queue_table {
+	DECLARE_HASHTABLE(queues, 8);
+};
+
 /**
 * struct ap_matrix_mdev - Contains the data associated with a matrix mediated
 *			   device.
@@ -81,18 +97,29 @@ struct ap_matrix {
 * @node:	allows the ap_matrix_mdev struct to be added to a list
 * @matrix:	the adapters, usage domains and control domains assigned to the
 *		mediated matrix device.
+ * @shadow_apcb:    the shadow copy of the APCB field of the KVM guest's CRYCB
 * @kvm:	the struct holding guest's state
 * @pqap_hook:	the function pointer to the interception handler for the
 *		PQAP(AQIC) instruction.
 * @mdev:	the mediated device
+ * @qtable:	table of queues (struct vfio_ap_queue) assigned to the mdev
+ * @apm_add:	bitmap of APIDs added to the host's AP configuration
+ * @aqm_add:	bitmap of APQIs added to the host's AP configuration
+ * @adm_add:	bitmap of control domain numbers added to the host's AP
+ *		configuration
 */
 struct ap_matrix_mdev {
 	struct vfio_device vdev;
 	struct list_head node;
 	struct ap_matrix matrix;
+	struct ap_matrix shadow_apcb;
 	struct kvm *kvm;
 	crypto_hook pqap_hook;
 	struct mdev_device *mdev;
+	struct ap_queue_table qtable;
+	DECLARE_BITMAP(apm_add, AP_DEVICES);
+	DECLARE_BITMAP(aqm_add, AP_DOMAINS);
+	DECLARE_BITMAP(adm_add, AP_DOMAINS);
 };

 /**
@@ -102,6 +129,8 @@ struct ap_matrix_mdev {
 * @saved_iova: the notification indicator byte (nib) address
 * @apqn: the APQN of the AP queue device
 * @saved_isc: the guest ISC registered with the GIB interface
+ * @mdev_qnode: allows the vfio_ap_queue struct to be added to a hashtable
+ * @reset_rc: the status response code from the last reset of the queue
 */
 struct vfio_ap_queue {
 	struct ap_matrix_mdev *matrix_mdev;
@@ -109,11 +138,21 @@ struct vfio_ap_queue {
 	int	apqn;
 #define VFIO_AP_ISC_INVALID 0xff
 	unsigned char saved_isc;
+	struct hlist_node mdev_qnode;
+	unsigned int reset_rc;
 };

 int vfio_ap_mdev_register(void);
 void vfio_ap_mdev_unregister(void);
-int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q,
-			     unsigned int retry);
+
+int vfio_ap_mdev_probe_queue(struct ap_device *queue);
+void vfio_ap_mdev_remove_queue(struct ap_device *queue);
+
+int vfio_ap_mdev_resource_in_use(unsigned long *apm, unsigned long *aqm);
+
+void vfio_ap_on_cfg_changed(struct ap_config_info *new_config_info,
+			    struct ap_config_info *old_config_info);
+void vfio_ap_on_scan_complete(struct ap_config_info *new_config_info,
+			      struct ap_config_info *old_config_info);

 #endif /* _VFIO_AP_PRIVATE_H_ */
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -420,6 +420,7 @@ typedef struct elf64_shdr {
 #define NT_S390_GS_CB	0x30b		/* s390 guarded storage registers */
 #define NT_S390_GS_BC	0x30c		/* s390 guarded storage broadcast control block */
 #define NT_S390_RI_CB	0x30d		/* s390 runtime instrumentation */
+#define NT_S390_PV_CPU_DATA	0x30e	/* s390 protvirt cpu dump data */
 #define NT_ARM_VFP	0x400		/* ARM VFP/NEON registers */
 #define NT_ARM_TLS	0x401		/* ARM TLS register */
 #define NT_ARM_HW_BREAK	0x402		/* ARM hardware breakpoint registers */
--- a/tools/testing/crypto/chacha20-s390/test-cipher.c
+++ b/tools/testing/crypto/chacha20-s390/test-cipher.c
@@ -252,29 +252,26 @@ static int __init chacha_s390_test_init(void)
 	memset(plain, 'a', data_size);
 	get_random_bytes(plain, (data_size > 256 ? 256 : data_size));

-	cipher_generic = vmalloc(data_size);
+	cipher_generic = vzalloc(data_size);
 	if (!cipher_generic) {
 		pr_info("could not allocate cipher_generic buffer\n");
 		ret = -2;
 		goto out;
 	}
-	memset(cipher_generic, 0, data_size);

-	cipher_s390 = vmalloc(data_size);
+	cipher_s390 = vzalloc(data_size);
 	if (!cipher_s390) {
 		pr_info("could not allocate cipher_s390 buffer\n");
 		ret = -2;
 		goto out;
 	}
-	memset(cipher_s390, 0, data_size);

-	revert = vmalloc(data_size);
+	revert = vzalloc(data_size);
 	if (!revert) {
 		pr_info("could not allocate revert buffer\n");
 		ret = -2;
 		goto out;
 	}
-	memset(revert, 0, data_size);

 	if (debug)
 		print_hex_dump(KERN_INFO, "src: ", DUMP_PREFIX_OFFSET,