From 6bd06f5a486c06023a618a86e8153b91d26f75f4 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Wed, 21 Mar 2018 12:46:19 -0600
Subject: [PATCH 1/7] vfio/type1: Adopt fast IOTLB flush interface when
 unmapping IOVAs

VFIO IOMMU type1 currently unmaps IOVA pages synchronously, which
requires an IOTLB flush for every unmapping. This results in a large
IOTLB flushing overhead when handling pass-through devices with a large
number of mapped IOVAs. This can be avoided by using the new IOTLB
flushing interface.
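
For reference, the deferred-flush pattern adopted below is roughly the
following sketch (region tracking, batching limits, and error handling
elided; the real code also unpins pages after the sync):

	unmapped = iommu_unmap_fast(domain, iova, len);
	if (unmapped)
		/* queue the range; no IOTLB flush issued yet */
		iommu_tlb_range_add(domain, iova, unmapped);

	/* ... batch further ranges the same way ... */

	/* a single flush then covers all queued ranges */
	iommu_tlb_sync(domain);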

Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
[aw - use LIST_HEAD]
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio_iommu_type1.c | 126 +++++++++++++++++++++++++++++---
 1 file changed, 115 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 45657e2b1ff7..3c082451ab1a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -102,6 +102,13 @@ struct vfio_pfn {
 	atomic_t		ref_count;
 };
 
+struct vfio_regions {
+	struct list_head list;
+	dma_addr_t iova;
+	phys_addr_t phys;
+	size_t len;
+};
+
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
 					(!list_empty(&iommu->domain_list))
 
@@ -660,11 +667,102 @@ unpin_exit:
 	return i > npage ? npage : (i > 0 ? i : -EINVAL);
 }
 
+static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
+				struct list_head *regions)
+{
+	long unlocked = 0;
+	struct vfio_regions *entry, *next;
+
+	iommu_tlb_sync(domain->domain);
+
+	list_for_each_entry_safe(entry, next, regions, list) {
+		unlocked += vfio_unpin_pages_remote(dma,
+						    entry->iova,
+						    entry->phys >> PAGE_SHIFT,
+						    entry->len >> PAGE_SHIFT,
+						    false);
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	cond_resched();
+
+	return unlocked;
+}
+
+/*
+ * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
+ * Therefore, when using the IOTLB flush sync interface, VFIO needs to track
+ * these regions (currently using a list).
+ *
+ * This value specifies the maximum number of regions per IOTLB flush sync.
+ */
+#define VFIO_IOMMU_TLB_SYNC_MAX		512
+
+static size_t unmap_unpin_fast(struct vfio_domain *domain,
+			       struct vfio_dma *dma, dma_addr_t *iova,
+			       size_t len, phys_addr_t phys, long *unlocked,
+			       struct list_head *unmapped_list,
+			       int *unmapped_cnt)
+{
+	size_t unmapped = 0;
+	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+
+	if (entry) {
+		unmapped = iommu_unmap_fast(domain->domain, *iova, len);
+
+		if (!unmapped) {
+			kfree(entry);
+		} else {
+			iommu_tlb_range_add(domain->domain, *iova, unmapped);
+			entry->iova = *iova;
+			entry->phys = phys;
+			entry->len  = unmapped;
+			list_add_tail(&entry->list, unmapped_list);
+
+			*iova += unmapped;
+			(*unmapped_cnt)++;
+		}
+	}
+
+	/*
+	 * Sync if the number of fast-unmap regions hits the limit
+	 * or in case of errors.
+	 */
+	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
+		*unlocked += vfio_sync_unpin(dma, domain,
+					     unmapped_list);
+		*unmapped_cnt = 0;
+	}
+
+	return unmapped;
+}
+
+static size_t unmap_unpin_slow(struct vfio_domain *domain,
+			       struct vfio_dma *dma, dma_addr_t *iova,
+			       size_t len, phys_addr_t phys,
+			       long *unlocked)
+{
+	size_t unmapped = iommu_unmap(domain->domain, *iova, len);
+
+	if (unmapped) {
+		*unlocked += vfio_unpin_pages_remote(dma, *iova,
+						     phys >> PAGE_SHIFT,
+						     unmapped >> PAGE_SHIFT,
+						     false);
+		*iova += unmapped;
+		cond_resched();
+	}
+	return unmapped;
+}
+
 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 			     bool do_accounting)
 {
 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
 	struct vfio_domain *domain, *d;
+	LIST_HEAD(unmapped_region_list);
+	int unmapped_region_cnt = 0;
 	long unlocked = 0;
 
 	if (!dma->size)
@@ -710,20 +808,26 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 				break;
 		}
 
-		unmapped = iommu_unmap(domain->domain, iova, len);
-		if (WARN_ON(!unmapped))
-			break;
-
-		unlocked += vfio_unpin_pages_remote(dma, iova,
-						    phys >> PAGE_SHIFT,
-						    unmapped >> PAGE_SHIFT,
-						    false);
-		iova += unmapped;
-
-		cond_resched();
+		/*
+		 * First, try to use fast unmap/unpin. In case of failure,
+		 * switch to slow unmap/unpin path.
+		 */
+		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
+					    &unlocked, &unmapped_region_list,
+					    &unmapped_region_cnt);
+		if (!unmapped) {
+			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
+						    phys, &unlocked);
+			if (WARN_ON(!unmapped))
+				break;
+		}
 	}
 
 	dma->iommu_mapped = false;
+
+	if (unmapped_region_cnt)
+		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list);
+
 	if (do_accounting) {
 		vfio_lock_acct(dma->task, -unlocked, NULL);
 		return 0;

From c9f89c3f87cfc026d88c08054710902dd52a7772 Mon Sep 17 00:00:00 2001
From: Shunyong Yang <shunyong.yang@hxt-semitech.com>
Date: Wed, 21 Mar 2018 12:46:23 -0600
Subject: [PATCH 2/7] vfio-mdev/samples: change RDI interrupt condition

When FIFO mode is enabled, the receive data available interrupt
(UART_IIR_RDI in the code) should be triggered when the number of bytes
in the FIFO is equal to or larger than the interrupt trigger level.

This patch changes the trigger level check to ensure that multiple
bytes received from the upper layer trigger the RDI interrupt
correctly. For example, with a trigger level of 8, a single 14-byte
write moves the count from 0 straight to 14, so an equality check never
fires while a greater-or-equal check does.

Cc: Joey Zheng <yu.zheng@hxt-semitech.com>
Signed-off-by: Shunyong Yang <shunyong.yang@hxt-semitech.com>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 samples/vfio-mdev/mtty.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index 09f255bdf3ac..7abb79d8313d 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -534,7 +534,7 @@ static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state,
 
 		/* Interrupt priority 2: Fifo trigger level reached */
 		if ((ier & UART_IER_RDI) &&
-		    (mdev_state->s[index].rxtx.count ==
+		    (mdev_state->s[index].rxtx.count >=
 		      mdev_state->s[index].intr_trigger_level))
 			*buf |= UART_IIR_RDI;
 

From 356e88ebe4473a3663cf3d14727ce293a4526d34 Mon Sep 17 00:00:00 2001
From: "Jason Cai (Xiang Feng)" <jason.cai@linux.alibaba.com>
Date: Thu, 22 Mar 2018 12:52:16 +0800
Subject: [PATCH 3/7] vfio/type1: Improve memory pinning process for raw PFN
 mapping

When using vfio to pass through a PCIe device (e.g. a GPU card) that
has a huge BAR (e.g. 16GB), a lot of cycles are wasted on memory
pinning because the PFNs of the PCI BAR are not backed by struct page,
and the corresponding VMA has the VM_PFNMAP flag set.

With this change, pinning a region which is a raw PFN mapping skips the
unnecessary user memory pinning process, which can significantly
improve a VM's boot-up time when passing through devices via VFIO. In
my test on a Xeon E5 2.6GHz, the time to map a 16GB BAR was reduced
from about 0.4s to 1.5us.
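
For context, a BAR mapped through vfio-pci becomes a raw PFN mapping
because the mmap handler is built on remap_pfn_range(), roughly as in
the simplified sketch below (not the exact vfio-pci code):

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	/* remap_pfn_range() marks the VMA VM_IO | VM_PFNMAP */
	return remap_pfn_range(vma, vma->vm_start,
			       pci_resource_start(pdev, bar) >> PAGE_SHIFT,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);

Such pages have no struct page, so the page-by-page pinning loop was
pure overhead for them.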

Signed-off-by: Jason Cai (Xiang Feng) <jason.cai@linux.alibaba.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio_iommu_type1.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 3c082451ab1a..5c212bf29640 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -404,7 +404,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 {
 	unsigned long pfn = 0;
 	long ret, pinned = 0, lock_acct = 0;
-	bool rsvd;
 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
 
 	/* This code path is only user initiated */
@@ -415,14 +414,23 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	if (ret)
 		return ret;
 
+	if (is_invalid_reserved_pfn(*pfn_base)) {
+		struct vm_area_struct *vma;
+
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+		pinned = min_t(long, npage, vma_pages(vma));
+		up_read(&current->mm->mmap_sem);
+		return pinned;
+	}
+
 	pinned++;
-	rsvd = is_invalid_reserved_pfn(*pfn_base);
 
 	/*
 	 * Reserved pages aren't counted against the user, externally pinned
 	 * pages are already counted against the user.
 	 */
-	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+	if (!vfio_find_vpfn(dma, iova)) {
 		if (!lock_cap && current->mm->locked_vm + 1 > limit) {
 			put_pfn(*pfn_base, dma->prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
@@ -442,13 +450,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 		if (ret)
 			break;
 
-		if (pfn != *pfn_base + pinned ||
-		    rsvd != is_invalid_reserved_pfn(pfn)) {
+		if (pfn != *pfn_base + pinned) {
 			put_pfn(pfn, dma->prot);
 			break;
 		}
 
-		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+		if (!vfio_find_vpfn(dma, iova)) {
 			if (!lock_cap &&
 			    current->mm->locked_vm + lock_acct + 1 > limit) {
 				put_pfn(pfn, dma->prot);
@@ -466,10 +473,8 @@ out:
 
 unpin_out:
 	if (ret) {
-		if (!rsvd) {
-			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
-				put_pfn(pfn, dma->prot);
-		}
+		for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
+			put_pfn(pfn, dma->prot);
 
 		return ret;
 	}

From 0d77ed3589ac054d197ccde7231e36f9e032426c Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 21 Mar 2018 12:46:20 -0600
Subject: [PATCH 4/7] vfio/pci: Pull BAR mapping setup from read-write path

This creates a common helper that we'll use for ioeventfd setup.

Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci_rdwr.c | 39 ++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 357243d76f10..5f2b376dcebd 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -113,6 +113,30 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf,
 	return done;
 }
 
+static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+	void __iomem *io;
+
+	if (vdev->barmap[bar])
+		return 0;
+
+	ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
+	if (ret)
+		return ret;
+
+	io = pci_iomap(pdev, bar, 0);
+	if (!io) {
+		pci_release_selected_regions(pdev, 1 << bar);
+		return -ENOMEM;
+	}
+
+	vdev->barmap[bar] = io;
+
+	return 0;
+}
+
 ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
 			size_t count, loff_t *ppos, bool iswrite)
 {
@@ -147,22 +171,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
 		if (!io)
 			return -ENOMEM;
 		x_end = end;
-	} else if (!vdev->barmap[bar]) {
-		int ret;
-
-		ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
+	} else {
+		int ret = vfio_pci_setup_barmap(vdev, bar);
 		if (ret)
 			return ret;
 
-		io = pci_iomap(pdev, bar, 0);
-		if (!io) {
-			pci_release_selected_regions(pdev, 1 << bar);
-			return -ENOMEM;
-		}
-
-		vdev->barmap[bar] = io;
-	} else
 		io = vdev->barmap[bar];
+	}
 
 	if (bar == vdev->msix_bar) {
 		x_start = vdev->msix_offset;

From 07fd7ef3a1c25a11015bb5821c9c5982f722d4a2 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 21 Mar 2018 12:46:21 -0600
Subject: [PATCH 5/7] vfio/pci: Use endian neutral helpers

The iowriteXX/ioreadXX functions assume little endian hardware and
convert to little endian on a write and from little endian on a read.
We currently do our own explicit conversion to negate this.  Instead,
add some endian-dependent defines to avoid all byte swaps.  There
should be no functional change other than that big endian systems are
no longer penalized with wasted swaps.
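
Concretely, for a 32-bit access on big endian the old path swapped the
value twice (once in le32_to_cpu(), once inside iowrite32()), while the
new helpers issue a single store with no swap on either endianness:

	/* before: double swap on big endian, net no-op */
	iowrite32(le32_to_cpu(val), io + off);

	/* after: raw store via iowrite32/iowrite32be */
	vfio_iowrite32(val, io + off);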

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci_rdwr.c | 34 ++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 5f2b376dcebd..925419e0f459 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -21,6 +21,24 @@
 
 #include "vfio_pci_private.h"
 
+#ifdef __LITTLE_ENDIAN
+#define vfio_ioread64	ioread64
+#define vfio_iowrite64	iowrite64
+#define vfio_ioread32	ioread32
+#define vfio_iowrite32	iowrite32
+#define vfio_ioread16	ioread16
+#define vfio_iowrite16	iowrite16
+#else
+#define vfio_ioread64	ioread64be
+#define vfio_iowrite64	iowrite64be
+#define vfio_ioread32	ioread32be
+#define vfio_iowrite32	iowrite32be
+#define vfio_ioread16	ioread16be
+#define vfio_iowrite16	iowrite16be
+#endif
+#define vfio_ioread8	ioread8
+#define vfio_iowrite8	iowrite8
+
 /*
  * Read or write from an __iomem region (MMIO or I/O port) with an excluded
  * range which is inaccessible.  The excluded range drops writes and fills
@@ -44,15 +62,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf,
 			fillable = 0;
 
 		if (fillable >= 4 && !(off % 4)) {
-			__le32 val;
+			u32 val;
 
 			if (iswrite) {
 				if (copy_from_user(&val, buf, 4))
 					return -EFAULT;
 
-				iowrite32(le32_to_cpu(val), io + off);
+				vfio_iowrite32(val, io + off);
 			} else {
-				val = cpu_to_le32(ioread32(io + off));
+				val = vfio_ioread32(io + off);
 
 				if (copy_to_user(buf, &val, 4))
 					return -EFAULT;
@@ -60,15 +78,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf,
 
 			filled = 4;
 		} else if (fillable >= 2 && !(off % 2)) {
-			__le16 val;
+			u16 val;
 
 			if (iswrite) {
 				if (copy_from_user(&val, buf, 2))
 					return -EFAULT;
 
-				iowrite16(le16_to_cpu(val), io + off);
+				vfio_iowrite16(val, io + off);
 			} else {
-				val = cpu_to_le16(ioread16(io + off));
+				val = vfio_ioread16(io + off);
 
 				if (copy_to_user(buf, &val, 2))
 					return -EFAULT;
@@ -82,9 +100,9 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf,
 				if (copy_from_user(&val, buf, 1))
 					return -EFAULT;
 
-				iowrite8(val, io + off);
+				vfio_iowrite8(val, io + off);
 			} else {
-				val = ioread8(io + off);
+				val = vfio_ioread8(io + off);
 
 				if (copy_to_user(buf, &val, 1))
 					return -EFAULT;

From 30656177c4080460b936709ff6648f201d7d2c1a Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 21 Mar 2018 12:46:21 -0600
Subject: [PATCH 6/7] vfio/pci: Add ioeventfd support

The ioeventfd here is actually irqfd handling of an ioeventfd such as
supported in KVM.  A user is able to pre-program a device write to
occur when the eventfd triggers.  This is yet another instance of
eventfd-irqfd triggering between KVM and vfio.  The impetus for this
is high frequency writes to pages which are virtualized in QEMU.
Enabling this near-direct write path for selected registers within
the virtualized page can improve performance and reduce overhead.
Specifically, this is initially targeted at NVIDIA graphics cards where
the driver issues a write to an MMIO register within a virtualized
region in order to allow the MSI interrupt to re-trigger.
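
A minimal userspace sketch of arming such a write (assuming device_fd
is an open vfio device fd, bar0_offset came from
VFIO_DEVICE_GET_REGION_INFO, and the 0x88 register offset and data
value are purely illustrative):

	struct vfio_device_ioeventfd ioeventfd = {
		.argsz  = sizeof(ioeventfd),
		.flags  = VFIO_DEVICE_IOEVENTFD_32,	/* 4-byte write */
		.offset = bar0_offset + 0x88,
		.data   = 0x1,
		.fd     = eventfd(0, 0),	/* -1 tears the mapping down */
	};

	ioctl(device_fd, VFIO_DEVICE_IOEVENTFD, &ioeventfd);

Every subsequent signal of the eventfd then performs the programmed
MMIO write entirely within the kernel.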

Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci.c         |  35 +++++++++
 drivers/vfio/pci/vfio_pci_private.h |  19 +++++
 drivers/vfio/pci/vfio_pci_rdwr.c    | 111 ++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h           |  27 +++++++
 4 files changed, 192 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index b0f759476900..c6822149b394 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -305,6 +305,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
 {
 	struct pci_dev *pdev = vdev->pdev;
 	struct vfio_pci_dummy_resource *dummy_res, *tmp;
+	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
 	int i, bar;
 
 	/* Stop the device from further DMA */
@@ -314,6 +315,15 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
 				VFIO_IRQ_SET_ACTION_TRIGGER,
 				vdev->irq_type, 0, 0, NULL);
 
+	/* Device closed, don't need mutex here */
+	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
+				 &vdev->ioeventfds_list, next) {
+		vfio_virqfd_disable(&ioeventfd->virqfd);
+		list_del(&ioeventfd->next);
+		kfree(ioeventfd);
+	}
+	vdev->ioeventfds_nr = 0;
+
 	vdev->virq_disabled = false;
 
 	for (i = 0; i < vdev->num_regions; i++)
@@ -1012,6 +1022,28 @@ hot_reset_release:
 
 		kfree(groups);
 		return ret;
+	} else if (cmd == VFIO_DEVICE_IOEVENTFD) {
+		struct vfio_device_ioeventfd ioeventfd;
+		int count;
+
+		minsz = offsetofend(struct vfio_device_ioeventfd, fd);
+
+		if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (ioeventfd.argsz < minsz)
+			return -EINVAL;
+
+		if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
+			return -EINVAL;
+
+		count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
+
+		if (hweight8(count) != 1 || ioeventfd.fd < -1)
+			return -EINVAL;
+
+		return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
+					  ioeventfd.data, count, ioeventfd.fd);
 	}
 
 	return -ENOTTY;
@@ -1174,6 +1206,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	vdev->irq_type = VFIO_PCI_NUM_IRQS;
 	mutex_init(&vdev->igate);
 	spin_lock_init(&vdev->irqlock);
+	mutex_init(&vdev->ioeventfds_lock);
+	INIT_LIST_HEAD(&vdev->ioeventfds_list);
 
 	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
 	if (ret) {
@@ -1215,6 +1249,7 @@ static void vfio_pci_remove(struct pci_dev *pdev)
 
 	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
 	kfree(vdev->region);
+	mutex_destroy(&vdev->ioeventfds_lock);
 	kfree(vdev);
 
 	if (vfio_pci_is_vga(pdev)) {
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index f561ac1c78a0..cde3b5d3441a 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -29,6 +29,19 @@
 #define PCI_CAP_ID_INVALID		0xFF	/* default raw access */
 #define PCI_CAP_ID_INVALID_VIRT		0xFE	/* default virt access */
 
+/* Cap maximum number of ioeventfds per device (arbitrary) */
+#define VFIO_PCI_IOEVENTFD_MAX		1000
+
+struct vfio_pci_ioeventfd {
+	struct list_head	next;
+	struct virqfd		*virqfd;
+	void __iomem		*addr;
+	uint64_t		data;
+	loff_t			pos;
+	int			bar;
+	int			count;
+};
+
 struct vfio_pci_irq_ctx {
 	struct eventfd_ctx	*trigger;
 	struct virqfd		*unmask;
@@ -92,9 +105,12 @@ struct vfio_pci_device {
 	bool			nointx;
 	struct pci_saved_state	*pci_saved_state;
 	int			refcnt;
+	int			ioeventfds_nr;
 	struct eventfd_ctx	*err_trigger;
 	struct eventfd_ctx	*req_trigger;
 	struct list_head	dummy_resources_list;
+	struct mutex		ioeventfds_lock;
+	struct list_head	ioeventfds_list;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
@@ -120,6 +136,9 @@ extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
 extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
 			       size_t count, loff_t *ppos, bool iswrite);
 
+extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
+			       uint64_t data, int count, int fd);
+
 extern int vfio_pci_init_perm_bits(void);
 extern void vfio_pci_uninit_perm_bits(void);
 
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 925419e0f459..a6029d0a5524 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -17,6 +17,7 @@
 #include <linux/pci.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
+#include <linux/vfio.h>
 #include <linux/vgaarb.h>
 
 #include "vfio_pci_private.h"
@@ -275,3 +276,113 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
 
 	return done;
 }
+
+static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
+{
+	struct vfio_pci_ioeventfd *ioeventfd = opaque;
+
+	switch (ioeventfd->count) {
+	case 1:
+		vfio_iowrite8(ioeventfd->data, ioeventfd->addr);
+		break;
+	case 2:
+		vfio_iowrite16(ioeventfd->data, ioeventfd->addr);
+		break;
+	case 4:
+		vfio_iowrite32(ioeventfd->data, ioeventfd->addr);
+		break;
+#ifdef iowrite64
+	case 8:
+		vfio_iowrite64(ioeventfd->data, ioeventfd->addr);
+		break;
+#endif
+	}
+
+	return 0;
+}
+
+long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
+			uint64_t data, int count, int fd)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	loff_t pos = offset & VFIO_PCI_OFFSET_MASK;
+	int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset);
+	struct vfio_pci_ioeventfd *ioeventfd;
+
+	/* Only support ioeventfds into BARs */
+	if (bar > VFIO_PCI_BAR5_REGION_INDEX)
+		return -EINVAL;
+
+	if (pos + count > pci_resource_len(pdev, bar))
+		return -EINVAL;
+
+	/* Disallow ioeventfds working around MSI-X table writes */
+	if (bar == vdev->msix_bar &&
+	    !(pos + count <= vdev->msix_offset ||
+	      pos >= vdev->msix_offset + vdev->msix_size))
+		return -EINVAL;
+
+#ifndef iowrite64
+	if (count == 8)
+		return -EINVAL;
+#endif
+
+	ret = vfio_pci_setup_barmap(vdev, bar);
+	if (ret)
+		return ret;
+
+	mutex_lock(&vdev->ioeventfds_lock);
+
+	list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) {
+		if (ioeventfd->pos == pos && ioeventfd->bar == bar &&
+		    ioeventfd->data == data && ioeventfd->count == count) {
+			if (fd == -1) {
+				vfio_virqfd_disable(&ioeventfd->virqfd);
+				list_del(&ioeventfd->next);
+				vdev->ioeventfds_nr--;
+				kfree(ioeventfd);
+				ret = 0;
+			} else
+				ret = -EEXIST;
+
+			goto out_unlock;
+		}
+	}
+
+	if (fd < 0) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+
+	ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL);
+	if (!ioeventfd) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	ioeventfd->addr = vdev->barmap[bar] + pos;
+	ioeventfd->data = data;
+	ioeventfd->pos = pos;
+	ioeventfd->bar = bar;
+	ioeventfd->count = count;
+
+	ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler,
+				 NULL, NULL, &ioeventfd->virqfd, fd);
+	if (ret) {
+		kfree(ioeventfd);
+		goto out_unlock;
+	}
+
+	list_add(&ioeventfd->next, &vdev->ioeventfds_list);
+	vdev->ioeventfds_nr++;
+
+out_unlock:
+	mutex_unlock(&vdev->ioeventfds_lock);
+
+	return ret;
+}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index c74372163ed2..1aa7b82e8169 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -575,6 +575,33 @@ struct vfio_device_gfx_plane_info {
 
 #define VFIO_DEVICE_GET_GFX_DMABUF _IO(VFIO_TYPE, VFIO_BASE + 15)
 
+/**
+ * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 16,
+ *                              struct vfio_device_ioeventfd)
+ *
+ * Perform a write to the device at the specified device fd offset, with
+ * the specified data and width when the provided eventfd is triggered.
+ * vfio bus drivers may not support this for all regions, for all widths,
+ * or at all.  vfio-pci currently only enables support for BAR regions,
+ * excluding the MSI-X vector table.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_ioeventfd {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DEVICE_IOEVENTFD_8		(1 << 0) /* 1-byte write */
+#define VFIO_DEVICE_IOEVENTFD_16	(1 << 1) /* 2-byte write */
+#define VFIO_DEVICE_IOEVENTFD_32	(1 << 2) /* 4-byte write */
+#define VFIO_DEVICE_IOEVENTFD_64	(1 << 3) /* 8-byte write */
+#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK	(0xf)
+	__u64	offset;			/* device fd offset of write */
+	__u64	data;			/* data to be written */
+	__s32	fd;			/* -1 for de-assignment */
+};
+
+#define VFIO_DEVICE_IOEVENTFD		_IO(VFIO_TYPE, VFIO_BASE + 16)
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**

From da9147140fe3de5a3a3fe5fe7f69739d4f39bea1 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 26 Mar 2018 13:36:55 -0600
Subject: [PATCH 7/7] MAINTAINERS: vfio/platform: Update sub-maintainer

Baptiste has changed positions and has not been active with
vfio-platform; replace him with the current, de facto sub-maintainer,
Eric Auger.

Acked-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 205c8fc12a9c..683d1988ecbb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14687,7 +14687,7 @@ F:	include/linux/mdev.h
 F:	samples/vfio-mdev/
 
 VFIO PLATFORM DRIVER
-M:	Baptiste Reynal <b.reynal@virtualopensystems.com>
+M:	Eric Auger <eric.auger@redhat.com>
 L:	kvm@vger.kernel.org
 S:	Maintained
 F:	drivers/vfio/platform/