From 0b934ccd707ff33a87f15a35a9916d1d8e85d30e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 4 Nov 2011 15:41:27 -0400 Subject: [PATCH 001/105] Xen: Export xen_biovec_phys_mergeable When Xen is enabled, using BIOVEC_PHYS_MERGEABLE in a module causes xen_biovec_phys_mergeable to be referenced, so it needs to be exported. Acked-by: Jens Axboe Signed-off-by: Matthew Wilcox --- drivers/xen/biomerge.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index ba6eda4b5143..18c1bb6ffce3 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -11,3 +11,4 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && ((mfn1 == mfn2) || ((mfn1+1) == mfn2)); } +EXPORT_SYMBOL(xen_biovec_phys_mergeable); From b60503ba432b16fc84442a84e29a7aad2c0c363d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 12:50:14 -0500 Subject: [PATCH 002/105] NVMe: New driver This driver is for devices that follow the NVM Express standard Signed-off-by: Matthew Wilcox --- Documentation/ioctl/ioctl-number.txt | 1 + drivers/block/Kconfig | 11 + drivers/block/Makefile | 1 + drivers/block/nvme.c | 1043 ++++++++++++++++++++++++++ include/linux/nvme.h | 343 +++++++++ 5 files changed, 1399 insertions(+) create mode 100644 drivers/block/nvme.c create mode 100644 include/linux/nvme.h diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 54078ed96b37..4840334ea97b 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -149,6 +149,7 @@ Code Seq#(hex) Include File Comments 'M' 01-03 drivers/scsi/megaraid/megaraid_sas.h 'M' 00-0F drivers/video/fsl-diu-fb.h conflict! 'N' 00-1F drivers/usb/scanner.h +'N' 40-7F drivers/block/nvme.c 'O' 00-06 mtd/ubi-user.h UBI 'P' all linux/soundcard.h conflict! 'P' 60-6F sound/sscape_ioctl.h conflict! diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 6f07ec1c2f58..35e56e1c948f 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -315,6 +315,17 @@ config BLK_DEV_NBD If unsure, say N. +config BLK_DEV_NVME + tristate "NVM Express block device" + depends on PCI + ---help--- + The NVM Express driver is for solid state drives directly + connected to the PCI or PCI Express bus. If you know you + don't have one of these, it is safe to answer N. + + To compile this driver as a module, choose M here: the + module will be called nvme. + config BLK_DEV_OSD tristate "OSD object-as-blkdev support" depends on SCSI_OSD_ULD diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 76646e9a1c91..349539ad3ad9 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_XILINX_SYSACE) += xsysace.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_MG_DISK) += mg_disk.o obj-$(CONFIG_SUNVDC) += sunvdc.o +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o obj-$(CONFIG_BLK_DEV_UMEM) += umem.o diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c new file mode 100644 index 000000000000..ef66eccc2aa2 --- /dev/null +++ b/drivers/block/nvme.c @@ -0,0 +1,1043 @@ +/* + * NVM Express device driver + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NVME_Q_DEPTH 1024 +#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) +#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) +#define NVME_MINORS 64 + +static int nvme_major; +module_param(nvme_major, int, 0); + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + u32 __iomem *dbs; + struct pci_dev *pci_dev; + int instance; + int queue_count; + u32 ctrl_config; + struct msix_entry *entry; + struct nvme_bar __iomem *bar; + struct list_head namespaces; +}; + +/* + * An NVM Express namespace is equivalent to a SCSI LUN + */ +struct nvme_ns { + struct list_head list; + + struct nvme_dev *dev; + struct request_queue *queue; + struct gendisk *disk; + + int ns_id; + int lba_shift; +}; + +/* + * An NVM Express queue. Each device has at least two (one for admin + * commands and one for I/O commands). + */ +struct nvme_queue { + struct device *q_dmadev; + spinlock_t q_lock; + struct nvme_command *sq_cmds; + volatile struct nvme_completion *cqes; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + wait_queue_head_t sq_full; + struct bio_list sq_cong; + u32 __iomem *q_db; + u16 q_depth; + u16 cq_vector; + u16 sq_head; + u16 sq_tail; + u16 cq_head; + u16 cq_cycle; + unsigned long cmdid_data[]; +}; + +/* + * Check we didin't inadvertently grow the command struct + */ +static inline void _nvme_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); + BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); +} + +/** + * alloc_cmdid - Allocate a Command ID + * @param nvmeq The queue that will be used for this command + * @param ctx A pointer that will be passed to the handler + * @param handler The ID of the handler to call + * + * Allocate a Command ID for a queue. The data passed in will + * be passed to the completion handler. This is implemented by using + * the bottom two bits of the ctx pointer to store the handler ID. + * Passing in a pointer that's not 4-byte aligned will cause a BUG. + * We can change this if it becomes a problem. 
+ */ +static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler) +{ + int depth = nvmeq->q_depth; + unsigned long data = (unsigned long)ctx | handler; + int cmdid; + + BUG_ON((unsigned long)ctx & 3); + + do { + cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); + if (cmdid >= depth) + return -EBUSY; + } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); + + nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(depth)] = data; + return cmdid; +} + +static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, + int handler) +{ + int cmdid; + wait_event_killable(nvmeq->sq_full, + (cmdid = alloc_cmdid(nvmeq, ctx, handler)) >= 0); + return (cmdid < 0) ? -EINTR : cmdid; +} + +/* If you need more than four handlers, you'll need to change how + * alloc_cmdid and nvme_process_cq work + */ +enum { + sync_completion_id = 0, + bio_completion_id, +}; + +static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) +{ + unsigned long data; + + data = nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)]; + clear_bit(cmdid, nvmeq->cmdid_data); + wake_up(&nvmeq->sq_full); + return data; +} + +static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) +{ + return ns->dev->queues[1]; +} + +static void put_nvmeq(struct nvme_queue *nvmeq) +{ +} + +/** + * nvme_submit_cmd: Copy a command into a queue and ring the doorbell + * @nvmeq: The queue to use + * @cmd: The command to send + * + * Safe to use from interrupt context + */ +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + u16 tail; + /* XXX: Need to check tail isn't going to overrun head */ + spin_lock_irqsave(&nvmeq->q_lock, flags); + tail = nvmeq->sq_tail; + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + writel(tail, nvmeq->q_db); + if (++tail == nvmeq->q_depth) + tail = 0; + nvmeq->sq_tail = tail; + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + + return 0; +} + +struct nvme_req_info { + struct bio *bio; + int nents; + struct scatterlist sg[0]; +}; + +/* XXX: use a mempool */ +static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp) +{ + return kmalloc(sizeof(struct nvme_req_info) + + sizeof(struct scatterlist) * nseg, gfp); +} + +static void free_info(struct nvme_req_info *info) +{ + kfree(info); +} + +static void bio_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + struct nvme_req_info *info = ctx; + struct bio *bio = info->bio; + u16 status = le16_to_cpup(&cqe->status) >> 1; + + dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents, + bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + free_info(info); + bio_endio(bio, status ? 
-EIO : 0); +} + +static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, + struct bio *bio, enum dma_data_direction dma_dir, int psegs) +{ + struct bio_vec *bvec; + struct scatterlist *sg = info->sg; + int i, nsegs; + + sg_init_table(sg, psegs); + bio_for_each_segment(bvec, bio, i) { + sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); + /* XXX: handle non-mergable here */ + nsegs++; + } + info->nents = nsegs; + + return dma_map_sg(dev, info->sg, info->nents, dma_dir); +} + +static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio) +{ + struct nvme_rw_command *cmnd; + struct nvme_req_info *info; + enum dma_data_direction dma_dir; + int cmdid; + u16 control; + u32 dsmgmt; + unsigned long flags; + int psegs = bio_phys_segments(ns->queue, bio); + + info = alloc_info(psegs, GFP_NOIO); + if (!info) + goto congestion; + info->bio = bio; + + cmdid = alloc_cmdid(nvmeq, info, bio_completion_id); + if (unlikely(cmdid < 0)) + goto free_info; + + control = 0; + if (bio->bi_rw & REQ_FUA) + control |= NVME_RW_FUA; + if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + dsmgmt = 0; + if (bio->bi_rw & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + spin_lock_irqsave(&nvmeq->q_lock, flags); + cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail].rw; + + if (bio_data_dir(bio)) { + cmnd->opcode = nvme_cmd_write; + dma_dir = DMA_TO_DEVICE; + } else { + cmnd->opcode = nvme_cmd_read; + dma_dir = DMA_FROM_DEVICE; + } + + nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs); + + cmnd->flags = 1; + cmnd->command_id = cmdid; + cmnd->nsid = cpu_to_le32(ns->ns_id); + cmnd->prp1 = cpu_to_le64(sg_phys(info->sg)); + /* XXX: Support more than one PRP */ + cmnd->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + cmnd->length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); + cmnd->control = cpu_to_le16(control); + cmnd->dsmgmt = cpu_to_le32(dsmgmt); + + writel(nvmeq->sq_tail, nvmeq->q_db); + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + + return 0; + + free_info: + free_info(info); + congestion: + return -EBUSY; +} + +/* + * NB: return value of non-zero would mean that we were a stacking driver. + * make_request must always succeed. 
+ */ +static int nvme_make_request(struct request_queue *q, struct bio *bio) +{ + struct nvme_ns *ns = q->queuedata; + struct nvme_queue *nvmeq = get_nvmeq(ns); + + if (nvme_submit_bio_queue(nvmeq, ns, bio)) { + blk_set_queue_congested(q, rw_is_sync(bio->bi_rw)); + bio_list_add(&nvmeq->sq_cong, bio); + } + put_nvmeq(nvmeq); + + return 0; +} + +struct sync_cmd_info { + struct task_struct *task; + u32 result; + int status; +}; + +static void sync_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + struct sync_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + wake_up_process(cmdinfo->task); +} + +typedef void (*completion_fn)(struct nvme_queue *, void *, + struct nvme_completion *); + +static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) +{ + u16 head, cycle; + + static const completion_fn completions[4] = { + [sync_completion_id] = sync_completion, + [bio_completion_id] = bio_completion, + }; + + head = nvmeq->cq_head; + cycle = nvmeq->cq_cycle; + + for (;;) { + unsigned long data; + void *ptr; + unsigned char handler; + struct nvme_completion cqe = nvmeq->cqes[head]; + if ((le16_to_cpu(cqe.status) & 1) != cycle) + break; + nvmeq->sq_head = le16_to_cpu(cqe.sq_head); + if (++head == nvmeq->q_depth) { + head = 0; + cycle = !cycle; + } + + data = free_cmdid(nvmeq, cqe.command_id); + handler = data & 3; + ptr = (void *)(data & ~3UL); + completions[handler](nvmeq, ptr, &cqe); + } + + /* If the controller ignores the cq head doorbell and continuously + * writes to the queue, it is theoretically possible to wrap around + * the queue twice and mistakenly return IRQ_NONE. Linux only + * requires that 0.1% of your interrupts are handled, so this isn't + * a big problem. + */ + if (head == nvmeq->cq_head && cycle == nvmeq->cq_cycle) + return IRQ_NONE; + + writel(head, nvmeq->q_db + 1); + nvmeq->cq_head = head; + nvmeq->cq_cycle = cycle; + + return IRQ_HANDLED; +} + +static irqreturn_t nvme_irq(int irq, void *data) +{ + return nvme_process_cq(data); +} + +/* + * Returns 0 on success. 
If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd, + u32 *result) +{ + int cmdid; + struct sync_cmd_info cmdinfo; + + cmdinfo.task = current; + cmdinfo.status = -EINTR; + + cmdid = alloc_cmdid_killable(q, &cmdinfo, sync_completion_id); + if (cmdid < 0) + return cmdid; + cmd->common.command_id = cmdid; + + set_current_state(TASK_UNINTERRUPTIBLE); + nvme_submit_cmd(q, cmd); + schedule(); + + if (result) + *result = cmdinfo.result; + + return cmdinfo.status; +} + +static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result) +{ + return nvme_submit_sync_cmd(dev->queues[0], cmd, result); +} + +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) +{ + int status; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(id); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + int status; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + + memset(&c, 0, sizeof(c)); + c.create_cq.opcode = nvme_admin_create_cq; + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); + c.create_cq.cqid = cpu_to_le16(qid); + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_cq.cq_flags = cpu_to_le16(flags); + c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + int status; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; + + memset(&c, 0, sizeof(c)); + c.create_sq.opcode = nvme_admin_create_sq; + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); + c.create_sq.sqid = cpu_to_le16(qid); + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_sq.sq_flags = cpu_to_le16(flags); + c.create_sq.cqid = cpu_to_le16(qid); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); +} + +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); +} + +static void nvme_free_queue(struct nvme_dev *dev, int qid) +{ + struct nvme_queue *nvmeq = dev->queues[qid]; + + free_irq(dev->entry[nvmeq->cq_vector].vector, nvmeq); + + /* Don't tell the adapter to delete the admin queue */ + if (qid) { + adapter_delete_sq(dev, qid); + adapter_delete_cq(dev, qid); + } + + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); +} + +static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth, int vector) +{ + struct device *dmadev = &dev->pci_dev->dev; + unsigned extra = (depth + BITS_TO_LONGS(depth)) * sizeof(long); + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); + if (!nvmeq) + return NULL; + + nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); + if (!nvmeq->cqes) + goto 
free_nvmeq; + memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); + + nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + goto free_cqdma; + + nvmeq->q_dmadev = dmadev; + spin_lock_init(&nvmeq->q_lock); + nvmeq->cq_head = 0; + nvmeq->cq_cycle = 1; + init_waitqueue_head(&nvmeq->sq_full); + bio_list_init(&nvmeq->sq_cong); + nvmeq->q_db = &dev->dbs[qid * 2]; + nvmeq->q_depth = depth; + nvmeq->cq_vector = vector; + + return nvmeq; + + free_cqdma: + dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); + free_nvmeq: + kfree(nvmeq); + return NULL; +} + +static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, + int qid, int cq_size, int vector) +{ + int result; + struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); + + result = adapter_alloc_cq(dev, qid, nvmeq); + if (result < 0) + goto free_nvmeq; + + result = adapter_alloc_sq(dev, qid, nvmeq); + if (result < 0) + goto release_cq; + + result = request_irq(dev->entry[vector].vector, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, "nvme", nvmeq); + if (result < 0) + goto release_sq; + + return nvmeq; + + release_sq: + adapter_delete_sq(dev, qid); + release_cq: + adapter_delete_cq(dev, qid); + free_nvmeq: + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); + return NULL; +} + +static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) +{ + int result; + u32 aqa; + struct nvme_queue *nvmeq; + + dev->dbs = ((void __iomem *)dev->bar) + 4096; + + nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + + aqa = nvmeq->q_depth - 1; + aqa |= aqa << 16; + + dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; + dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + + writel(aqa, &dev->bar->aqa); + writeq(nvmeq->sq_dma_addr, &dev->bar->asq); + writeq(nvmeq->cq_dma_addr, &dev->bar->acq); + writel(dev->ctrl_config, &dev->bar->cc); + + while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + } + + result = request_irq(dev->entry[0].vector, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, "nvme admin", nvmeq); + dev->queues[0] = nvmeq; + return result; +} + +static int nvme_identify(struct nvme_ns *ns, void __user *addr, int cns) +{ + struct nvme_dev *dev = ns->dev; + int status; + struct nvme_command c; + void *page; + dma_addr_t dma_addr; + + page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, + GFP_KERNEL); + + memset(&c, 0, sizeof(c)); + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cns ? 
0 : cpu_to_le32(ns->ns_id); + c.identify.prp1 = cpu_to_le64(dma_addr); + c.identify.cns = cpu_to_le32(cns); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + + if (status) + status = -EIO; + else if (copy_to_user(addr, page, 4096)) + status = -EFAULT; + + dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr); + + return status; +} + +static int nvme_get_range_type(struct nvme_ns *ns, void __user *addr) +{ + struct nvme_dev *dev = ns->dev; + int status; + struct nvme_command c; + void *page; + dma_addr_t dma_addr; + + page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, + GFP_KERNEL); + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(ns->ns_id); + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + + /* XXX: Assuming first range for now */ + if (status) + status = -EIO; + else if (copy_to_user(addr, page, 64)) + status = -EFAULT; + + dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr); + + return status; +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case NVME_IOCTL_IDENTIFY_NS: + return nvme_identify(ns, (void __user *)arg, 0); + case NVME_IOCTL_IDENTIFY_CTRL: + return nvme_identify(ns, (void __user *)arg, 1); + case NVME_IOCTL_GET_RANGE_TYPE: + return nvme_get_range_type(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + +static const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, +}; + +static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, + struct nvme_id_ns *id, struct nvme_lba_range_type *rt) +{ + struct nvme_ns *ns; + struct gendisk *disk; + int lbaf; + + if (rt->attributes & NVME_LBART_ATTRIB_HIDE) + return NULL; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return NULL; + ns->queue = blk_alloc_queue(GFP_KERNEL); + if (!ns->queue) + goto out_free_ns; + ns->queue->queue_flags = QUEUE_FLAG_DEFAULT | QUEUE_FLAG_NOMERGES | + QUEUE_FLAG_NONROT | QUEUE_FLAG_DISCARD; + blk_queue_make_request(ns->queue, nvme_make_request); + ns->dev = dev; + ns->queue->queuedata = ns; + + disk = alloc_disk(NVME_MINORS); + if (!disk) + goto out_free_queue; + ns->ns_id = index; + ns->disk = disk; + lbaf = id->flbas & 0xf; + ns->lba_shift = id->lbaf[lbaf].ds; + + disk->major = nvme_major; + disk->minors = NVME_MINORS; + disk->first_minor = NVME_MINORS * index; + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index); + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + return ns; + + out_free_queue: + blk_cleanup_queue(ns->queue); + out_free_ns: + kfree(ns); + return NULL; +} + +static void nvme_ns_free(struct nvme_ns *ns) +{ + put_disk(ns->disk); + blk_cleanup_queue(ns->queue); + kfree(ns); +} + +static int set_queue_count(struct nvme_dev *dev, int sq_count, int cq_count) +{ + int status; + u32 result; + struct nvme_command c; + u32 q_count = (sq_count - 1) | ((cq_count - 1) << 16); + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES); + c.features.dword11 = cpu_to_le32(q_count); + + status = nvme_submit_admin_cmd(dev, &c, &result); + if (status) + return -EIO; + return min(result & 0xffff, result >> 16) + 1; +} + +/* XXX: Create 
per-CPU queues */ +static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) +{ + int this_cpu; + + set_queue_count(dev, 1, 1); + + this_cpu = get_cpu(); + dev->queues[1] = nvme_create_queue(dev, 1, NVME_Q_DEPTH, this_cpu); + put_cpu(); + if (!dev->queues[1]) + return -ENOMEM; + dev->queue_count++; + + return 0; +} + +static void nvme_free_queues(struct nvme_dev *dev) +{ + int i; + + for (i = dev->queue_count - 1; i >= 0; i--) + nvme_free_queue(dev, i); +} + +static int __devinit nvme_dev_add(struct nvme_dev *dev) +{ + int res, nn, i; + struct nvme_ns *ns, *next; + void *id; + dma_addr_t dma_addr; + struct nvme_command cid, crt; + + res = nvme_setup_io_queues(dev); + if (res) + return res; + + /* XXX: Switch to a SG list once prp2 works */ + id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, + GFP_KERNEL); + + memset(&cid, 0, sizeof(cid)); + cid.identify.opcode = nvme_admin_identify; + cid.identify.nsid = 0; + cid.identify.prp1 = cpu_to_le64(dma_addr); + cid.identify.cns = cpu_to_le32(1); + + res = nvme_submit_admin_cmd(dev, &cid, NULL); + if (res) { + res = -EIO; + goto out_free; + } + + nn = le32_to_cpup(&((struct nvme_id_ctrl *)id)->nn); + + cid.identify.cns = 0; + memset(&crt, 0, sizeof(crt)); + crt.features.opcode = nvme_admin_get_features; + crt.features.prp1 = cpu_to_le64(dma_addr + 4096); + crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); + + for (i = 0; i < nn; i++) { + cid.identify.nsid = cpu_to_le32(i); + res = nvme_submit_admin_cmd(dev, &cid, NULL); + if (res) + continue; + + if (((struct nvme_id_ns *)id)->ncap == 0) + continue; + + crt.features.nsid = cpu_to_le32(i); + res = nvme_submit_admin_cmd(dev, &crt, NULL); + if (res) + continue; + + ns = nvme_alloc_ns(dev, i, id, id + 4096); + if (ns) + list_add_tail(&ns->list, &dev->namespaces); + } + list_for_each_entry(ns, &dev->namespaces, list) + add_disk(ns->disk); + + dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + return 0; + + out_free: + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + list_del(&ns->list); + nvme_ns_free(ns); + } + + dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + return res; +} + +static int nvme_dev_remove(struct nvme_dev *dev) +{ + struct nvme_ns *ns, *next; + + /* TODO: wait all I/O finished or cancel them */ + + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + list_del(&ns->list); + del_gendisk(ns->disk); + nvme_ns_free(ns); + } + + nvme_free_queues(dev); + + return 0; +} + +/* XXX: Use an ida or something to let remove / add work correctly */ +static void nvme_set_instance(struct nvme_dev *dev) +{ + static int instance; + dev->instance = instance++; +} + +static void nvme_release_instance(struct nvme_dev *dev) +{ +} + +static int __devinit nvme_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int result = -ENOMEM; + struct nvme_dev *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), + GFP_KERNEL); + if (!dev->entry) + goto free; + dev->queues = kcalloc(2, sizeof(void *), GFP_KERNEL); + if (!dev->queues) + goto free; + + INIT_LIST_HEAD(&dev->namespaces); + dev->pci_dev = pdev; + pci_set_drvdata(pdev, dev); + dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64)); + nvme_set_instance(dev); + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) { + result = -ENOMEM; + goto disable; + } + + result = nvme_configure_admin_queue(dev); + if (result) + goto unmap; + dev->queue_count++; + + result = 
nvme_dev_add(dev); + if (result) + goto delete; + return 0; + + delete: + nvme_free_queues(dev); + unmap: + iounmap(dev->bar); + disable: + pci_disable_msix(pdev); + nvme_release_instance(dev); + free: + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); + return result; +} + +static void __devexit nvme_remove(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_dev_remove(dev); + pci_disable_msix(pdev); + iounmap(dev->bar); + nvme_release_instance(dev); + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); +} + +/* These functions are yet to be implemented */ +#define nvme_error_detected NULL +#define nvme_dump_registers NULL +#define nvme_link_reset NULL +#define nvme_slot_reset NULL +#define nvme_error_resume NULL +#define nvme_suspend NULL +#define nvme_resume NULL + +static struct pci_error_handlers nvme_err_handler = { + .error_detected = nvme_error_detected, + .mmio_enabled = nvme_dump_registers, + .link_reset = nvme_link_reset, + .slot_reset = nvme_slot_reset, + .resume = nvme_error_resume, +}; + +/* Move to pci_ids.h later */ +#define PCI_CLASS_STORAGE_EXPRESS 0x010802 + +static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, nvme_id_table); + +static struct pci_driver nvme_driver = { + .name = "nvme", + .id_table = nvme_id_table, + .probe = nvme_probe, + .remove = __devexit_p(nvme_remove), + .suspend = nvme_suspend, + .resume = nvme_resume, + .err_handler = &nvme_err_handler, +}; + +static int __init nvme_init(void) +{ + int result; + + nvme_major = register_blkdev(nvme_major, "nvme"); + if (nvme_major <= 0) + return -EBUSY; + + result = pci_register_driver(&nvme_driver); + if (!result) + return 0; + + unregister_blkdev(nvme_major, "nvme"); + return result; +} + +static void __exit nvme_exit(void) +{ + pci_unregister_driver(&nvme_driver); + unregister_blkdev(nvme_major, "nvme"); +} + +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.1"); +module_init(nvme_init); +module_exit(nvme_exit); diff --git a/include/linux/nvme.h b/include/linux/nvme.h new file mode 100644 index 000000000000..9ba53584f722 --- /dev/null +++ b/include/linux/nvme.h @@ -0,0 +1,343 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#ifndef _LINUX_NVME_H +#define _LINUX_NVME_H + +#include + +struct nvme_bar { + __u64 cap; /* Controller Capabilities */ + __u32 vs; /* Version */ + __u32 ims; /* Interrupt Mask Set */ + __u32 imc; /* Interrupt Mask Clear */ + __u32 cc; /* Controller Configuration */ + __u32 csts; /* Controller Status */ + __u32 aqa; /* Admin Queue Attributes */ + __u64 asq; /* Admin SQ Base Address */ + __u64 acq; /* Admin CQ Base Address */ +}; + +enum { + NVME_CC_ENABLE = 1 << 0, + NVME_CC_CSS_NVM = 0 << 4, + NVME_CC_MPS_SHIFT = 7, + NVME_CC_ARB_RR = 0 << 11, + NVME_CC_ARB_WRRU = 1 << 11, + NVME_CC_ARB_VS = 3 << 11, + NVME_CC_SHN_NONE = 0 << 13, + NVME_CC_SHN_NORMAL = 1 << 13, + NVME_CC_SHN_ABRUPT = 2 << 13, + NVME_CSTS_RDY = 1 << 0, + NVME_CSTS_CFS = 1 << 1, + NVME_CSTS_SHST_NORMAL = 0 << 2, + NVME_CSTS_SHST_OCCUR = 1 << 2, + NVME_CSTS_SHST_CMPLT = 2 << 2, +}; + +#define NVME_VS(major, minor) (major << 16 | minor) + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __le32 nn; + __u8 rab; + __u8 rsvd77[178]; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 rsvd264[248]; + __le64 psd[32]; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 rsvd778[246]; + __u8 cmdset[2048]; + __u8 vs[1024]; +}; + +struct nvme_lbaf { + __le16 ms; + __u8 ds; + __u8 rp; +}; + +struct nvme_id_ns { + __le64 nsze; + __le64 ncap; + __le64 nuse; + __u8 nsfeat; + __u8 nlbaf; + __u8 flbas; + __u8 mc; + __u8 dpc; + __u8 dps; + __u8 rsvd30[98]; + struct nvme_lbaf lbaf[16]; + __u8 rsvd192[192]; + __u8 vs[3712]; +}; + +enum { + NVME_NS_FEAT_THIN = 1 << 0, + NVME_LBAF_RP_BEST = 0, + NVME_LBAF_RP_BETTER = 1, + NVME_LBAF_RP_GOOD = 2, + NVME_LBAF_RP_DEGRADED = 3, +}; + +struct nvme_lba_range_type { + __u8 type; + __u8 attributes; + __u8 rsvd2[14]; + __u64 slba; + __u64 nlb; + __u8 guid[16]; + __u8 rsvd48[16]; +}; + +enum { + NVME_LBART_TYPE_FS = 0x01, + NVME_LBART_TYPE_RAID = 0x02, + NVME_LBART_TYPE_CACHE = 0x03, + NVME_LBART_TYPE_SWAP = 0x04, + + NVME_LBART_ATTRIB_TEMP = 1 << 0, + NVME_LBART_ATTRIB_HIDE = 1 << 1, +}; + +/* I/O commands */ + +enum nvme_opcode { + nvme_cmd_flush = 0x00, + nvme_cmd_write = 0x01, + nvme_cmd_read = 0x02, + nvme_cmd_write_uncor = 0x04, + nvme_cmd_compare = 0x05, + nvme_cmd_dsm = 0x09, +}; + +struct nvme_rw_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, +}; + +/* Admin commands */ + +enum nvme_admin_opcode { + nvme_admin_delete_sq = 0x00, + nvme_admin_create_sq = 0x01, + nvme_admin_get_features = 0x02, + nvme_admin_delete_cq = 0x04, + nvme_admin_create_cq = 0x05, + nvme_admin_identify = 0x06, + nvme_admin_abort_cmd = 0x08, + nvme_admin_set_features = 0x09, + nvme_admin_get_log_page = 0x0a, + nvme_admin_async_event = 0x0c, + 
nvme_admin_download_fw = 0x0d, + nvme_admin_security_recv = 0x0e, + nvme_admin_format_nvm = 0x10, + nvme_admin_security_send = 0x11, + nvme_admin_activate_fw = 0x14, +}; + +enum { + NVME_QUEUE_PHYS_CONTIG = (1 << 0), + NVME_CQ_IRQ_ENABLED = (1 << 1), + NVME_SQ_PRIO_URGENT = (0 << 1), + NVME_SQ_PRIO_HIGH = (1 << 1), + NVME_SQ_PRIO_MEDIUM = (2 << 1), + NVME_SQ_PRIO_LOW = (3 << 1), + NVME_FEAT_ARBITRATION = 0x01, + NVME_FEAT_POWER_MGMT = 0x02, + NVME_FEAT_LBA_RANGE = 0x03, + NVME_FEAT_TEMP_THRESH = 0x04, + NVME_FEAT_ERR_RECOVERY = 0x05, + NVME_FEAT_VOLATILE_WC = 0x06, + NVME_FEAT_NUM_QUEUES = 0x07, + NVME_FEAT_IRQ_COALESCE = 0x08, + NVME_FEAT_IRQ_CONFIG = 0x09, + NVME_FEAT_WRITE_ATOMIC = 0x0a, + NVME_FEAT_ASYNC_EVENT = 0x0b, + NVME_FEAT_SW_PROGRESS = 0x0c, +}; + +struct nvme_identify { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 cns; + __u32 rsvd11[5]; +}; + +struct nvme_features { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 fid; + __le32 dword11; + __u32 rsvd12[4]; +}; + +struct nvme_create_cq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 cqid; + __le16 qsize; + __le16 cq_flags; + __le16 irq_vector; + __u32 rsvd12[4]; +}; + +struct nvme_create_sq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 sqid; + __le16 qsize; + __le16 sq_flags; + __le16 cqid; + __le32 rsvd12[4]; +}; + +struct nvme_delete_queue { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 qid; + __le16 rsvd10; + __le32 rsvd11[5]; +}; + +struct nvme_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u32 rsvd2[14]; +}; + +struct nvme_command { + union { + struct nvme_common_command common; + struct nvme_rw_command rw; + struct nvme_identify identify; + struct nvme_features features; + struct nvme_create_cq create_cq; + struct nvme_create_sq create_sq; + struct nvme_delete_queue delete_queue; + }; +}; + +/* XXX: Sync with spec */ +enum { + NVME_SC_SUCCESS = 0x0, + NVME_SC_INVALID_OPCODE = 0x1, + NVME_SC_INVALID_FIELD = 0x2, + NVME_SC_CMDID_CONFLICT = 0x3, + NVME_SC_DATA_XFER_ERROR = 0x4, + NVME_SC_POWER_LOSS = 0x5, + NVME_SC_INTERNAL = 0x6, + NVME_SC_ABORT_REQ = 0x7, + NVME_SC_ABORT_QUEUE = 0x8, + NVME_SC_FUSED_FAIL = 0x9, + NVME_SC_FUSED_MISSING = 0xa, + NVME_SC_LBA_RANGE = 0x80, + NVME_SC_CAP_EXCEEDED = 0x81, + NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_CQ_INVALID = 0x100, + NVME_SC_QID_INVALID = 0x101, + NVME_SC_QUEUE_SIZE = 0x102, + NVME_SC_WRITE_FAULT = 0x280, + NVME_SC_READ_ERROR = 0x281, +}; + +struct nvme_completion { + __le32 result; /* Used by admin commands to return data */ + __le32 rsvd; + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? 
*/ +}; + +#define NVME_IOCTL_IDENTIFY_NS _IOW('N', 0x40, struct nvme_id_ns) +#define NVME_IOCTL_IDENTIFY_CTRL _IOW('N', 0x41, struct nvme_id_ctrl) +#define NVME_IOCTL_GET_RANGE_TYPE _IOW('N', 0x42, struct nvme_lba_range_type) + +#endif /* _LINUX_NVME_H */ From 3001082cac4bf6ffd09f72b39e6292ad6394ef17 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 09:10:15 -0500 Subject: [PATCH 003/105] NVMe: Factor out queue_request_irq() Two callers with an almost identical long string of arguments, and introducing a third soon. Time to factor out the commonalities. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index ef66eccc2aa2..b10e064795ed 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -568,6 +568,13 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, return NULL; } +static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, + const char *name) +{ + return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, name, nvmeq); +} + static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, int cq_size, int vector) { @@ -582,8 +589,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, if (result < 0) goto release_cq; - result = request_irq(dev->entry[vector].vector, nvme_irq, - IRQF_DISABLED | IRQF_SHARED, "nvme", nvmeq); + result = queue_request_irq(dev, nvmeq, "nvme"); if (result < 0) goto release_sq; @@ -630,8 +636,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return -EINTR; } - result = request_irq(dev->entry[0].vector, nvme_irq, - IRQF_DISABLED | IRQF_SHARED, "nvme admin", nvmeq); + result = queue_request_irq(dev, nvmeq, "nvme admin"); dev->queues[0] = nvmeq; return result; } From b3b06812e199f248561ce7824a4a8a9cd573c05a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 09:14:34 -0500 Subject: [PATCH 004/105] NVMe: Reduce set_queue_count arguments by one sq_count and cq_count are always the same, so just call it 'count'. 
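For illustration (hypothetical numbers, following the encoding in the code below): asking for count = 4 encodes dword11 as 0x00030003, since both the SQ and CQ halves are zero-based; if the controller reports back, say, 0x00070003 (four SQs and eight CQs granted), set_queue_count() returns min(3, 7) + 1 = 4.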
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b10e064795ed..7efd7e92b637 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -777,12 +777,12 @@ static void nvme_ns_free(struct nvme_ns *ns) kfree(ns); } -static int set_queue_count(struct nvme_dev *dev, int sq_count, int cq_count) +static int set_queue_count(struct nvme_dev *dev, int count) { int status; u32 result; struct nvme_command c; - u32 q_count = (sq_count - 1) | ((cq_count - 1) << 16); + u32 q_count = (count - 1) | ((count - 1) << 16); memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; @@ -800,7 +800,7 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) { int this_cpu; - set_queue_count(dev, 1, 1); + set_queue_count(dev, 1); this_cpu = get_cpu(); dev->queues[1] = nvme_create_queue(dev, 1, NVME_Q_DEPTH, this_cpu); From 1b23484bd012c078de2ea939249e2fb2e85a0a6e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 13:01:49 -0500 Subject: [PATCH 005/105] NVMe: Implement per-CPU queues Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 61 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 7efd7e92b637..b6a213c98584 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -172,11 +172,17 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) { - return ns->dev->queues[1]; + int qid, cpu = get_cpu(); + if (cpu < ns->dev->queue_count) + qid = cpu + 1; + else + qid = (cpu % rounddown_pow_of_two(ns->dev->queue_count)) + 1; + return ns->dev->queues[qid]; } static void put_nvmeq(struct nvme_queue *nvmeq) { + put_cpu(); } /** @@ -795,19 +801,51 @@ static int set_queue_count(struct nvme_dev *dev, int count) return min(result & 0xffff, result >> 16) + 1; } -/* XXX: Create per-CPU queues */ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) { - int this_cpu; + int result, cpu, i, nr_queues; - set_queue_count(dev, 1); + nr_queues = num_online_cpus(); + result = set_queue_count(dev, nr_queues); + if (result < 0) + return result; + if (result < nr_queues) + nr_queues = result; - this_cpu = get_cpu(); - dev->queues[1] = nvme_create_queue(dev, 1, NVME_Q_DEPTH, this_cpu); - put_cpu(); - if (!dev->queues[1]) - return -ENOMEM; - dev->queue_count++; + /* Deregister the admin queue's interrupt */ + free_irq(dev->entry[0].vector, dev->queues[0]); + + for (i = 0; i < nr_queues; i++) + dev->entry[i].entry = i; + for (;;) { + result = pci_enable_msix(dev->pci_dev, dev->entry, nr_queues); + if (result == 0) { + break; + } else if (result > 0) { + nr_queues = result; + continue; + } else { + nr_queues = 1; + break; + } + } + + result = queue_request_irq(dev, dev->queues[0], "nvme admin"); + /* XXX: handle failure here */ + + cpu = cpumask_first(cpu_online_mask); + for (i = 0; i < nr_queues; i++) { + irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); + cpu = cpumask_next(cpu, cpu_online_mask); + } + + for (i = 0; i < nr_queues; i++) { + dev->queues[i + 1] = nvme_create_queue(dev, i + 1, + NVME_Q_DEPTH, i); + if (!dev->queues[i + 1]) + return -ENOMEM; + dev->queue_count++; + } return 0; } @@ -931,7 +969,8 @@ static int __devinit nvme_probe(struct pci_dev *pdev, GFP_KERNEL); if (!dev->entry) goto free; - dev->queues = kcalloc(2, sizeof(void *), GFP_KERNEL); + 
dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), + GFP_KERNEL); if (!dev->queues) goto free; From 821234603b265f59d7eebce16d9e8beca2a5752d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 13:24:06 -0500 Subject: [PATCH 006/105] NVMe: Rename 'cycle' to 'phase' It's called the phase bit in the current draft Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b6a213c98584..3d917a87ea93 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -93,7 +93,7 @@ struct nvme_queue { u16 sq_head; u16 sq_tail; u16 cq_head; - u16 cq_cycle; + u16 cq_phase; unsigned long cmdid_data[]; }; @@ -364,7 +364,7 @@ typedef void (*completion_fn)(struct nvme_queue *, void *, static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) { - u16 head, cycle; + u16 head, phase; static const completion_fn completions[4] = { [sync_completion_id] = sync_completion, @@ -372,19 +372,19 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) }; head = nvmeq->cq_head; - cycle = nvmeq->cq_cycle; + phase = nvmeq->cq_phase; for (;;) { unsigned long data; void *ptr; unsigned char handler; struct nvme_completion cqe = nvmeq->cqes[head]; - if ((le16_to_cpu(cqe.status) & 1) != cycle) + if ((le16_to_cpu(cqe.status) & 1) != phase) break; nvmeq->sq_head = le16_to_cpu(cqe.sq_head); if (++head == nvmeq->q_depth) { head = 0; - cycle = !cycle; + phase = !phase; } data = free_cmdid(nvmeq, cqe.command_id); @@ -399,12 +399,12 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) * requires that 0.1% of your interrupts are handled, so this isn't * a big problem. */ - if (head == nvmeq->cq_head && cycle == nvmeq->cq_cycle) + if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) return IRQ_NONE; writel(head, nvmeq->q_db + 1); nvmeq->cq_head = head; - nvmeq->cq_cycle = cycle; + nvmeq->cq_phase = phase; return IRQ_HANDLED; } @@ -557,7 +557,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_dmadev = dmadev; spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; - nvmeq->cq_cycle = 1; + nvmeq->cq_phase = 1; init_waitqueue_head(&nvmeq->sq_full); bio_list_init(&nvmeq->sq_cong); nvmeq->q_db = &dev->dbs[qid * 2]; From 53c9577e9ca68a633c6e9df2b54eaecacfa77f62 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jan 2011 13:42:34 -0500 Subject: [PATCH 007/105] NVMe: Fix admin IRQ claim on real hardware The admin IRQ is supposed to use the pin-based (or single message MSI) interrupt. Accomplish this by filling in entry[0]'s vector with the INTx irq number. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 3d917a87ea93..44a9d5edd8db 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -979,6 +979,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, dev); dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64)); nvme_set_instance(dev); + dev->entry[0].vector = pdev->irq; dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) { From 36c14ed9caa957c686d4a48fd598a5ec2aa0331b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 24 Jan 2011 07:52:07 -0500 Subject: [PATCH 008/105] NVMe: Use PRP2 for the nvme_identify ioctl DMA the result straight to userspace instead of bounce-buffering in the kernel. 
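For example (illustrative numbers): a 4096-byte identify buffer whose user address begins 0x200 bytes into a page spans two pages, so PRP1 is set to the DMA address of the first page plus 0x200 and PRP2 to the DMA address of the second page; a page-aligned buffer needs only PRP1.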
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 56 +++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 44a9d5edd8db..c0ef1dd1cc90 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -647,33 +647,57 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_identify(struct nvme_ns *ns, void __user *addr, int cns) +static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) { struct nvme_dev *dev = ns->dev; - int status; + int i, err, count, nents, offset; struct nvme_command c; - void *page; - dma_addr_t dma_addr; + struct scatterlist sg[2]; + struct page *pages[2]; - page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, - GFP_KERNEL); + if (addr & 3) + return -EINVAL; + offset = offset_in_page(addr); + count = offset ? 2 : 1; + + err = get_user_pages_fast(addr, count, 1, pages); + if (err < count) { + count = err; + err = -EFAULT; + goto put_pages; + } + sg_init_table(sg, count); + for (i = 0; i < count; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + nents = dma_map_sg(&dev->pci_dev->dev, sg, count, DMA_FROM_DEVICE); + if (!nents) + goto put_pages; memset(&c, 0, sizeof(c)); c.identify.opcode = nvme_admin_identify; c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id); - c.identify.prp1 = cpu_to_le64(dma_addr); + c.identify.prp1 = cpu_to_le64(sg_dma_address(&sg[0]) + offset); + if (count > 1) { + u64 dma_addr; + if (nents > 1) + dma_addr = sg_dma_address(&sg[1]); + else + dma_addr = sg_dma_address(&sg[0]) + PAGE_SIZE; + c.identify.prp2 = cpu_to_le64(dma_addr); + } c.identify.cns = cpu_to_le32(cns); - status = nvme_submit_admin_cmd(dev, &c, NULL); + err = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - status = -EIO; - else if (copy_to_user(addr, page, 4096)) - status = -EFAULT; + if (err) + err = -EIO; - dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr); + dma_unmap_sg(&dev->pci_dev->dev, sg, nents, DMA_FROM_DEVICE); + put_pages: + for (i = 0; i < count; i++) + put_page(pages[i]); - return status; + return err; } static int nvme_get_range_type(struct nvme_ns *ns, void __user *addr) @@ -713,9 +737,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, switch (cmd) { case NVME_IOCTL_IDENTIFY_NS: - return nvme_identify(ns, (void __user *)arg, 0); + return nvme_identify(ns, arg, 0); case NVME_IOCTL_IDENTIFY_CTRL: - return nvme_identify(ns, (void __user *)arg, 1); + return nvme_identify(ns, arg, 1); case NVME_IOCTL_GET_RANGE_TYPE: return nvme_get_range_type(ns, (void __user *)arg); default: From 7b4fe9b1cb4b9a6f4ae23a12ef96d08d96e2a5da Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 26 Jan 2011 10:01:21 -0500 Subject: [PATCH 009/105] NVMe: Make nvme_common_command more featureful Add prp1, prp2 and the metadata prp to the common command, since the fields are generally used this way. 
Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9ba53584f722..1c0b5ef08959 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -142,6 +142,18 @@ enum nvme_opcode { nvme_cmd_dsm = 0x09, }; +struct nvme_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __u32 rsvd10[6]; +}; + struct nvme_rw_command { __u8 opcode; __u8 flags; @@ -284,14 +296,6 @@ struct nvme_delete_queue { __le32 rsvd11[5]; }; -struct nvme_common_command { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u32 rsvd2[14]; -}; - struct nvme_command { union { struct nvme_common_command common; From ff22b54fda2078fc3cd1bcdcb7a5ce5d08fd6591 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 26 Jan 2011 10:02:29 -0500 Subject: [PATCH 010/105] NVMe: Add nvme_setup_prps() Generalise the code from nvme_identify() that sets PRP1 & PRP2 so that it's usable for commands sent by nvme_submit_bio_queue(). Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 70 +++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index c0ef1dd1cc90..1e57737b1760 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -240,6 +240,36 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, bio_endio(bio, status ? -EIO : 0); } +/* length is in bytes */ +static void nvme_setup_prps(struct nvme_common_command *cmd, + struct scatterlist *sg, int length) +{ + int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg); + int offset = offset_in_page(dma_addr); + + cmd->prp1 = cpu_to_le64(dma_addr); + length -= (PAGE_SIZE - offset); + if (length <= 0) + return; + + dma_len -= (PAGE_SIZE - offset); + if (dma_len) { + dma_addr += (PAGE_SIZE - offset); + } else { + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + if (length <= PAGE_SIZE) { + cmd->prp2 = cpu_to_le64(dma_addr); + return; + } + + /* XXX: support PRP lists */ +} + static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { @@ -261,7 +291,7 @@ static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio) { - struct nvme_rw_command *cmnd; + struct nvme_command *cmnd; struct nvme_req_info *info; enum dma_data_direction dma_dir; int cmdid; @@ -290,27 +320,26 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; spin_lock_irqsave(&nvmeq->q_lock, flags); - cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail].rw; + cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; if (bio_data_dir(bio)) { - cmnd->opcode = nvme_cmd_write; + cmnd->rw.opcode = nvme_cmd_write; dma_dir = DMA_TO_DEVICE; } else { - cmnd->opcode = nvme_cmd_read; + cmnd->rw.opcode = nvme_cmd_read; dma_dir = DMA_FROM_DEVICE; } nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs); - cmnd->flags = 1; - cmnd->command_id = cmdid; - cmnd->nsid = cpu_to_le32(ns->ns_id); - cmnd->prp1 = cpu_to_le64(sg_phys(info->sg)); - /* XXX: Support more than one PRP */ - cmnd->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); - cmnd->length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); - cmnd->control = 
cpu_to_le16(control); - cmnd->dsmgmt = cpu_to_le32(dsmgmt); + cmnd->rw.flags = 1; + cmnd->rw.command_id = cmdid; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + nvme_setup_prps(&cmnd->common, info->sg, bio->bi_size); + cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); writel(nvmeq->sq_tail, nvmeq->q_db); if (++nvmeq->sq_tail == nvmeq->q_depth) @@ -667,8 +696,9 @@ static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) goto put_pages; } sg_init_table(sg, count); - for (i = 0; i < count; i++) - sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + sg_set_page(&sg[0], pages[0], PAGE_SIZE - offset, offset); + if (count > 1) + sg_set_page(&sg[1], pages[1], offset, 0); nents = dma_map_sg(&dev->pci_dev->dev, sg, count, DMA_FROM_DEVICE); if (!nents) goto put_pages; @@ -676,15 +706,7 @@ static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) memset(&c, 0, sizeof(c)); c.identify.opcode = nvme_admin_identify; c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id); - c.identify.prp1 = cpu_to_le64(sg_dma_address(&sg[0]) + offset); - if (count > 1) { - u64 dma_addr; - if (nents > 1) - dma_addr = sg_dma_address(&sg[1]); - else - dma_addr = sg_dma_address(&sg[0]) + PAGE_SIZE; - c.identify.prp2 = cpu_to_le64(dma_addr); - } + nvme_setup_prps(&c.common, sg, 4096); c.identify.cns = cpu_to_le32(cns); err = nvme_submit_admin_cmd(dev, &c, NULL); From b8deb62cf271fa9381edc8cf52bcae2f0225c55a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 26 Jan 2011 10:08:25 -0500 Subject: [PATCH 011/105] NVMe: Zero the command before we send it Make sure there's no left-over bits set from previous commands that used this slot. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 1e57737b1760..25ca7af96469 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -322,6 +322,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, spin_lock_irqsave(&nvmeq->q_lock, flags); cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + memset(cmnd, 0, sizeof(*cmnd)); if (bio_data_dir(bio)) { cmnd->rw.opcode = nvme_cmd_write; dma_dir = DMA_TO_DEVICE; From bd38c5557cf482fc195e2264b32ea62eed60730a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 26 Jan 2011 14:34:32 -0500 Subject: [PATCH 012/105] NVMe: Change NVME_IOCTL_GET_RANGE_TYPE to return all the ranges Factor out most of nvme_identify() into a new nvme_submit_user_admin_command() function. Change nvme_get_range_type() to call it and change nvme_ioctl to realise that it's getting back all 64 ranges. 
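An illustrative userspace sketch (not part of this patch; it assumes <linux/nvme.h> is visible to userspace and "fd" is an open nvme namespace block device):

    #include <linux/nvme.h>
    #include <sys/ioctl.h>
    #include <stdio.h>

    static int read_lba_ranges(int fd)
    {
            /* The driver now copies back a full page: 64 entries of
             * 64 bytes each, not just the first range. */
            struct nvme_lba_range_type ranges[64];

            if (ioctl(fd, NVME_IOCTL_GET_RANGE_TYPE, ranges) < 0) {
                    perror("NVME_IOCTL_GET_RANGE_TYPE");
                    return -1;
            }
            printf("range 0: type %d, attributes %#x\n",
                   ranges[0].type, (unsigned)ranges[0].attributes);
            return 0;
    }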
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 49 +++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 25ca7af96469..b28d188d10f8 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -677,18 +677,17 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) +static int nvme_submit_user_admin_command(struct nvme_dev *dev, unsigned long addr, + unsigned length, struct nvme_command *cmd) { - struct nvme_dev *dev = ns->dev; int i, err, count, nents, offset; - struct nvme_command c; struct scatterlist sg[2]; struct page *pages[2]; if (addr & 3) return -EINVAL; offset = offset_in_page(addr); - count = offset ? 2 : 1; + count = ((offset + length) > PAGE_SIZE) ? 2 : 1; err = get_user_pages_fast(addr, count, 1, pages); if (err < count) { @@ -704,13 +703,9 @@ static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) if (!nents) goto put_pages; - memset(&c, 0, sizeof(c)); - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id); - nvme_setup_prps(&c.common, sg, 4096); - c.identify.cns = cpu_to_le32(cns); + nvme_setup_prps(&cmd->common, sg, length); - err = nvme_submit_admin_cmd(dev, &c, NULL); + err = nvme_submit_admin_cmd(dev, cmd, NULL); if (err) err = -EIO; @@ -723,34 +718,28 @@ static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) return err; } -static int nvme_get_range_type(struct nvme_ns *ns, void __user *addr) +static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) { - struct nvme_dev *dev = ns->dev; - int status; struct nvme_command c; - void *page; - dma_addr_t dma_addr; - page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, - GFP_KERNEL); + memset(&c, 0, sizeof(c)); + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id); + c.identify.cns = cpu_to_le32(cns); + + return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c); +} + +static int nvme_get_range_type(struct nvme_ns *ns, unsigned long addr) +{ + struct nvme_command c; memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; c.features.nsid = cpu_to_le32(ns->ns_id); - c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); - status = nvme_submit_admin_cmd(dev, &c, NULL); - - /* XXX: Assuming first range for now */ - if (status) - status = -EIO; - else if (copy_to_user(addr, page, 64)) - status = -EFAULT; - - dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr); - - return status; + return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c); } static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, @@ -764,7 +753,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, case NVME_IOCTL_IDENTIFY_CTRL: return nvme_identify(ns, arg, 1); case NVME_IOCTL_GET_RANGE_TYPE: - return nvme_get_range_type(ns, (void __user *)arg); + return nvme_get_range_type(ns, arg); default: return -ENOTTY; } From 7fc3cdabba75c2516b8b645eb0ca7907aea70415 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 26 Jan 2011 17:05:50 -0500 Subject: [PATCH 013/105] NVMe: Create nvme_map_user_pages() and nvme_unmap_user_pages() These are generalisations of the code that was in nvme_submit_user_admin_command(). 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 70 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b28d188d10f8..f44d6cd87ea2 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -677,17 +677,22 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_submit_user_admin_command(struct nvme_dev *dev, unsigned long addr, - unsigned length, struct nvme_command *cmd) +static int nvme_map_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, unsigned length, + struct scatterlist **sgp) { int i, err, count, nents, offset; - struct scatterlist sg[2]; - struct page *pages[2]; + struct scatterlist *sg; + struct page **pages; if (addr & 3) return -EINVAL; + if (!length) + return -EINVAL; + offset = offset_in_page(addr); - count = ((offset + length) > PAGE_SIZE) ? 2 : 1; + count = DIV_ROUND_UP(offset + length, PAGE_SIZE); + pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); err = get_user_pages_fast(addr, count, 1, pages); if (err < count) { @@ -695,29 +700,62 @@ static int nvme_submit_user_admin_command(struct nvme_dev *dev, unsigned long ad err = -EFAULT; goto put_pages; } + + sg = kcalloc(count, sizeof(*sg), GFP_KERNEL); sg_init_table(sg, count); sg_set_page(&sg[0], pages[0], PAGE_SIZE - offset, offset); - if (count > 1) - sg_set_page(&sg[1], pages[1], offset, 0); - nents = dma_map_sg(&dev->pci_dev->dev, sg, count, DMA_FROM_DEVICE); + length -= (PAGE_SIZE - offset); + for (i = 1; i < count; i++) { + sg_set_page(&sg[i], pages[i], min_t(int, length, PAGE_SIZE), 0); + length -= PAGE_SIZE; + } + + err = -ENOMEM; + nents = dma_map_sg(&dev->pci_dev->dev, sg, count, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (!nents) goto put_pages; - nvme_setup_prps(&cmd->common, sg, length); + kfree(pages); + *sgp = sg; + return nents; - err = nvme_submit_admin_cmd(dev, cmd, NULL); - - if (err) - err = -EIO; - - dma_unmap_sg(&dev->pci_dev->dev, sg, nents, DMA_FROM_DEVICE); put_pages: for (i = 0; i < count; i++) put_page(pages[i]); - + kfree(pages); return err; } +static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, int length, + struct scatterlist *sg, int nents) +{ + int i, count; + + count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE); + dma_unmap_sg(&dev->pci_dev->dev, sg, nents, DMA_FROM_DEVICE); + + for (i = 0; i < count; i++) + put_page(sg_page(&sg[i])); +} + +static int nvme_submit_user_admin_command(struct nvme_dev *dev, + unsigned long addr, unsigned length, + struct nvme_command *cmd) +{ + int err, nents; + struct scatterlist *sg; + + nents = nvme_map_user_pages(dev, 0, addr, length, &sg); + if (nents < 0) + return nents; + nvme_setup_prps(&cmd->common, sg, length); + err = nvme_submit_admin_cmd(dev, cmd, NULL); + nvme_unmap_user_pages(dev, 0, addr, length, sg, nents); + return err ? -EIO : 0; +} + static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) { struct nvme_command c; From a53295b6998f62d961c29e54051c1cf1d738c2b3 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 16:13:29 -0500 Subject: [PATCH 014/105] NVMe: Add NVME_IOCTL_SUBMIT_IO Allow userspace to submit synchronous I/O like the SCSI sg interface does. 
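A userspace read through the new ioctl might look roughly like this (a sketch only: it assumes struct nvme_user_io and NVME_IOCTL_SUBMIT_IO from include/linux/nvme.h are visible to userspace, a hypothetical /dev/nvme0n1 node, and the NVMe read opcode value 0x02):

    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <err.h>
    #include <linux/nvme.h>

    int main(void)
    {
        static unsigned char buf[4096];     /* 8 blocks of 512 bytes; 4-byte aligned */
        struct nvme_user_io io;
        int fd = open("/dev/nvme0n1", O_RDWR);

        if (fd < 0)
            err(1, "open");
        memset(&io, 0, sizeof(io));
        io.opcode = 0x02;                   /* read; assumed opcode value */
        io.nsid = 1;
        io.addr = (unsigned long)buf;
        io.slba = 0;
        io.nblocks = 8;
        io.block_shift = 9;                 /* 512-byte LBAs */
        if (ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io) < 0)
            err(1, "NVME_IOCTL_SUBMIT_IO");
        return 0;
    }

The return-value convention is inherited from nvme_submit_sync_cmd: a negative value is a Linux errno, a positive value is an NVM Express status code.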
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 18 ++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f44d6cd87ea2..40fb2e1bdfe4 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -780,6 +780,47 @@ static int nvme_get_range_type(struct nvme_ns *ns, unsigned long addr) return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c); } +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_dev *dev = ns->dev; + struct nvme_queue *nvmeq; + struct nvme_user_io io; + struct nvme_command c; + unsigned length; + u32 result; + int nents, status; + struct scatterlist *sg; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + length = io.nblocks << io.block_shift; + nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length, &sg); + if (nents < 0) + return nents; + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(io.nsid); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks - 1); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); /* XXX: endian? */ + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + /* XXX: metadata */ + nvme_setup_prps(&c.common, sg, length); + + nvmeq = get_nvmeq(ns); + status = nvme_submit_sync_cmd(nvmeq, &c, &result); + put_nvmeq(nvmeq); + + nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); + put_user(result, &uio->result); + return status; +} + static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -792,6 +833,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, return nvme_identify(ns, arg, 1); case NVME_IOCTL_GET_RANGE_TYPE: return nvme_get_range_type(ns, arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); default: return -ENOTTY; } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1c0b5ef08959..0aaecb059d14 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -340,8 +340,26 @@ struct nvme_completion { __le16 status; /* did the command fail, and if so, why? 
*/ }; +struct nvme_user_io { + __u8 opcode; + __u8 flags; + __u16 control; + __u32 nsid; + __u64 metadata; + __u64 addr; + __u64 slba; + __u16 nblocks; + __u16 block_shift; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; + __u32 result; +}; + #define NVME_IOCTL_IDENTIFY_NS _IOW('N', 0x40, struct nvme_id_ns) #define NVME_IOCTL_IDENTIFY_CTRL _IOW('N', 0x41, struct nvme_id_ctrl) #define NVME_IOCTL_GET_RANGE_TYPE _IOW('N', 0x42, struct nvme_lba_range_type) +#define NVME_IOCTL_SUBMIT_IO _IOWR('N', 0x43, struct nvme_rw_command) #endif /* _LINUX_NVME_H */ From 51814232ecae90f888c902e252306df8d017f0dd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 16:18:08 -0500 Subject: [PATCH 015/105] NVMe: Read the model, serial & firmware rev from the controller Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 40fb2e1bdfe4..12e37c1cf057 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -58,6 +58,9 @@ struct nvme_dev { struct msix_entry *entry; struct nvme_bar __iomem *bar; struct list_head namespaces; + char serial[20]; + char model[40]; + char firmware_rev[8]; }; /* @@ -979,6 +982,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) { int res, nn, i; struct nvme_ns *ns, *next; + struct nvme_id_ctrl *ctrl; void *id; dma_addr_t dma_addr; struct nvme_command cid, crt; @@ -1003,7 +1007,11 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) goto out_free; } - nn = le32_to_cpup(&((struct nvme_id_ctrl *)id)->nn); + ctrl = id; + nn = le32_to_cpup(&ctrl->nn); + memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); + memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); + memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); cid.identify.cns = 0; memset(&crt, 0, sizeof(crt)); From 8e9f0e71150bf6277d0ea40bc8feb1338ddf13fd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 31 Jan 2011 10:46:14 -0500 Subject: [PATCH 016/105] NVMe: Remove 'node' from nvme_dev We don't keep a list of nvme_dev any more Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 12e37c1cf057..9377cf32f813 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -48,7 +48,6 @@ module_param(nvme_major, int, 0); * Represents an NVM Express device. Each nvme_dev is a PCI function. */ struct nvme_dev { - struct list_head node; struct nvme_queue **queues; u32 __iomem *dbs; struct pci_dev *pci_dev; From 3f85d50b609e8a5ef151656210203a6e94c19538 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 08:39:04 -0500 Subject: [PATCH 017/105] NVMe: Check returns from nvme_alloc_queue() It can return NULL, so handle that. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 9377cf32f813..dc821776be94 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -619,6 +619,9 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int result; struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); + if (!nvmeq) + return NULL; + result = adapter_alloc_cq(dev, qid, nvmeq); if (result < 0) goto free_nvmeq; @@ -655,6 +658,8 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) dev->dbs = ((void __iomem *)dev->bar) + 4096; nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + if (!nvmeq) + return -ENOMEM; aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; From 0ee5a7d7cb9309bd393a25c395f19fb12a842602 Mon Sep 17 00:00:00 2001 From: Shane Michael Matthews Date: Tue, 1 Feb 2011 08:49:30 -0500 Subject: [PATCH 018/105] NVMe: Enable and disable the PCI device Call pci_enable_device_mem() at initialisation and pci_disable_device at exit. Signed-off-by: Shane Michael Matthews Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index dc821776be94..1dda4b5c2302 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1103,6 +1103,9 @@ static int __devinit nvme_probe(struct pci_dev *pdev, if (!dev->queues) goto free; + if (pci_enable_device_mem(pdev)) + goto free; + INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; pci_set_drvdata(pdev, dev); @@ -1133,6 +1136,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev, disable: pci_disable_msix(pdev); nvme_release_instance(dev); + pci_disable_device(pdev); free: kfree(dev->queues); kfree(dev->entry); @@ -1147,6 +1151,7 @@ static void __devexit nvme_remove(struct pci_dev *pdev) pci_disable_msix(pdev); iounmap(dev->bar); nvme_release_instance(dev); + pci_disable_device(pdev); kfree(dev->queues); kfree(dev->entry); kfree(dev); From f64d3365a3e5cb46e69db7e2c82a7cb9a5bed1b8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 09:01:59 -0500 Subject: [PATCH 019/105] NVMe: Enable device DMA Need to call pci_set_master() to enable device DMA Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 1dda4b5c2302..128fd70031a9 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1105,6 +1105,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev, if (pci_enable_device_mem(pdev)) goto free; + pci_set_master(pdev); INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; From 2930353f9f2b9e4629e935acd970cb73c1171229 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 16:23:39 -0500 Subject: [PATCH 020/105] NVMe: Allow queues to be allocated above 4GB Need to call dma_set_coherent_mask() to allow queues to be allocated above 4GB. 
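Both calls can fail on platforms without 64-bit DMA support; a more defensive variant (a sketch of a possible follow-up, not what this patch does) would check the return values and fall back to a 32-bit mask:

    if (dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) ||
        dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64))) {
        /* fall back to 32-bit DMA addressing */
        dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
        dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
    }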
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 128fd70031a9..46f872021369 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1110,7 +1110,8 @@ static int __devinit nvme_probe(struct pci_dev *pdev, INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; pci_set_drvdata(pdev, dev); - dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64)); + dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); nvme_set_instance(dev); dev->entry[0].vector = pdev->irq; From 574e8b95bc3780e10e9b5e9d51074d503dd3d5d9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 16:24:35 -0500 Subject: [PATCH 021/105] NVMe: Request I/O regions Calling pci_request_selected_regions() reserves these regions for our use. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 46f872021369..bda91178f475 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1088,7 +1088,7 @@ static void nvme_release_instance(struct nvme_dev *dev) static int __devinit nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - int result = -ENOMEM; + int bars, result = -ENOMEM; struct nvme_dev *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); @@ -1106,6 +1106,9 @@ static int __devinit nvme_probe(struct pci_dev *pdev, if (pci_enable_device_mem(pdev)) goto free; pci_set_master(pdev); + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (pci_request_selected_regions(pdev, bars, "nvme")) + goto disable; INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; @@ -1118,7 +1121,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev, dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) { result = -ENOMEM; - goto disable; + goto disable_msix; } result = nvme_configure_admin_queue(dev); @@ -1135,10 +1138,12 @@ static int __devinit nvme_probe(struct pci_dev *pdev, nvme_free_queues(dev); unmap: iounmap(dev->bar); - disable: + disable_msix: pci_disable_msix(pdev); nvme_release_instance(dev); + disable: pci_disable_device(pdev); + pci_release_regions(pdev); free: kfree(dev->queues); kfree(dev->entry); @@ -1154,6 +1159,7 @@ static void __devexit nvme_remove(struct pci_dev *pdev) iounmap(dev->bar); nvme_release_instance(dev); pci_disable_device(pdev); + pci_release_regions(pdev); kfree(dev->queues); kfree(dev->entry); kfree(dev); From 5911f20039ce59d7e7834f0c42151cf759b6f786 Mon Sep 17 00:00:00 2001 From: Shane Michael Matthews Date: Tue, 1 Feb 2011 11:31:55 -0500 Subject: [PATCH 022/105] NVMe: Disable the device before we write the admin queues In case the card has been left in a partially-configured state, write 0 to the Enable bit. 
Signed-off-by: Shane Michael Matthews Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index bda91178f475..e3d921577b94 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -668,6 +668,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + writel(0, &dev->bar->cc); writel(aqa, &dev->bar->aqa); writeq(nvmeq->sq_dma_addr, &dev->bar->asq); writeq(nvmeq->cq_dma_addr, &dev->bar->acq); From 388f037f4e7f0a24bac6b1a24f144f5d939f58cf Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 1 Feb 2011 12:49:38 -0500 Subject: [PATCH 023/105] NVMe: Move sysfs entries to the right place Because I wasn't setting driverfs_dev, the devices were showing up under /sys/devices/virtual/block. Now they appear underneath the PCI device which they belong to. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index e3d921577b94..744db3877c42 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -889,6 +889,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; + disk->driverfs_dev = &dev->pci_dev->dev; sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index); set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); From 7a63e07b9a98b77dd075e06b93c1d8dc871ddad5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 3 Feb 2011 09:20:57 -0500 Subject: [PATCH 024/105] NVMe: Add remaining status codes Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 0aaecb059d14..dbbdc126401b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -321,14 +321,29 @@ enum { NVME_SC_ABORT_QUEUE = 0x8, NVME_SC_FUSED_FAIL = 0x9, NVME_SC_FUSED_MISSING = 0xa, + NVME_SC_INVALID_NS = 0xb, NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, NVME_SC_CQ_INVALID = 0x100, NVME_SC_QID_INVALID = 0x101, NVME_SC_QUEUE_SIZE = 0x102, + NVME_SC_ABORT_LIMIT = 0x103, + NVME_SC_ABORT_MISSING = 0x104, + NVME_SC_ASYNC_LIMIT = 0x105, + NVME_SC_FIRMWARE_SLOT = 0x106, + NVME_SC_FIRMWARE_IMAGE = 0x107, + NVME_SC_INVALID_VECTOR = 0x108, + NVME_SC_INVALID_LOG_PAGE = 0x109, + NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_BAD_ATTRIBUTES = 0x180, NVME_SC_WRITE_FAULT = 0x280, NVME_SC_READ_ERROR = 0x281, + NVME_SC_GUARD_CHECK = 0x282, + NVME_SC_APPTAG_CHECK = 0x283, + NVME_SC_REFTAG_CHECK = 0x284, + NVME_SC_COMPARE_FAILED = 0x285, + NVME_SC_ACCESS_DENIED = 0x286, }; struct nvme_completion { From 6ee44cdced04a53dc4f27eb97067e6cd33784726 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 3 Feb 2011 10:58:26 -0500 Subject: [PATCH 025/105] NVMe: Add download / activate firmware ioctls Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 33 ++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 6 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 744db3877c42..7cdf7f69cdcd 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -829,6 +829,47 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return 
status; } +static int nvme_download_firmware(struct nvme_ns *ns, + struct nvme_dlfw __user *udlfw) +{ + struct nvme_dev *dev = ns->dev; + struct nvme_dlfw dlfw; + struct nvme_command c; + int nents, status; + struct scatterlist *sg; + + if (copy_from_user(&dlfw, udlfw, sizeof(dlfw))) + return -EFAULT; + if (dlfw.length >= (1 << 30)) + return -EINVAL; + + nents = nvme_map_user_pages(dev, 1, dlfw.addr, dlfw.length * 4, &sg); + if (nents < 0) + return nents; + + memset(&c, 0, sizeof(c)); + c.dlfw.opcode = nvme_admin_download_fw; + c.dlfw.numd = cpu_to_le32(dlfw.length); + c.dlfw.offset = cpu_to_le32(dlfw.offset); + nvme_setup_prps(&c.common, sg, dlfw.length * 4); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents); + return status; +} + +static int nvme_activate_firmware(struct nvme_ns *ns, unsigned long arg) +{ + struct nvme_dev *dev = ns->dev; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_activate_fw; + c.common.rsvd10[0] = cpu_to_le32(arg); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -843,6 +884,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, return nvme_get_range_type(ns, arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); + case NVME_IOCTL_DOWNLOAD_FW: + return nvme_download_firmware(ns, (void __user *)arg); + case NVME_IOCTL_ACTIVATE_FW: + return nvme_activate_firmware(ns, arg); default: return -ENOTTY; } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index dbbdc126401b..8eed0e432eef 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -262,7 +262,7 @@ struct nvme_create_cq { __u8 opcode; __u8 flags; __u16 command_id; - __le32 rsvd1[5]; + __u32 rsvd1[5]; __le64 prp1; __u64 rsvd8; __le16 cqid; @@ -276,14 +276,14 @@ struct nvme_create_sq { __u8 opcode; __u8 flags; __u16 command_id; - __le32 rsvd1[5]; + __u32 rsvd1[5]; __le64 prp1; __u64 rsvd8; __le16 sqid; __le16 qsize; __le16 sq_flags; __le16 cqid; - __le32 rsvd12[4]; + __u32 rsvd12[4]; }; struct nvme_delete_queue { @@ -292,8 +292,20 @@ struct nvme_delete_queue { __u16 command_id; __u32 rsvd1[9]; __le16 qid; - __le16 rsvd10; - __le32 rsvd11[5]; + __u16 rsvd10; + __u32 rsvd11[5]; +}; + +struct nvme_download_firmware { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + __le32 numd; + __le32 offset; + __u32 rsvd12[4]; }; struct nvme_command { @@ -305,6 +317,7 @@ struct nvme_command { struct nvme_create_cq create_cq; struct nvme_create_sq create_sq; struct nvme_delete_queue delete_queue; + struct nvme_download_firmware dlfw; }; }; @@ -348,7 +361,7 @@ enum { struct nvme_completion { __le32 result; /* Used by admin commands to return data */ - __le32 rsvd; + __u32 rsvd; __le16 sq_head; /* how much of this queue may be reclaimed */ __le16 sq_id; /* submission queue that generated this entry */ __u16 command_id; /* of the command which completed */ @@ -372,9 +385,17 @@ struct nvme_user_io { __u32 result; }; +struct nvme_dlfw { + __u64 addr; + __u32 length; /* In dwords */ + __u32 offset; /* In dwords */ +}; + #define NVME_IOCTL_IDENTIFY_NS _IOW('N', 0x40, struct nvme_id_ns) #define NVME_IOCTL_IDENTIFY_CTRL _IOW('N', 0x41, struct nvme_id_ctrl) #define NVME_IOCTL_GET_RANGE_TYPE _IOW('N', 0x42, struct nvme_lba_range_type) #define NVME_IOCTL_SUBMIT_IO _IOWR('N', 0x43, struct nvme_rw_command) 
+#define NVME_IOCTL_DOWNLOAD_FW _IOR('N', 0x44, struct nvme_dlfw) +#define NVME_IOCTL_ACTIVATE_FW _IO('N', 0x45) #endif /* _LINUX_NVME_H */ From db5d0c198d673b6a932b449d4db95a2ad50c755e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 3 Feb 2011 14:36:07 -0500 Subject: [PATCH 026/105] NVMe: Release 0.2 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 7cdf7f69cdcd..06a6aeaa827a 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1272,6 +1272,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.1"); +MODULE_VERSION("0.2"); module_init(nvme_init); module_exit(nvme_exit); From 3c0cf138d7789feb3f335f6f1d24ad8fc8b3a23f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 4 Feb 2011 16:03:56 -0500 Subject: [PATCH 027/105] NVMe: Allow fatal signals to interrupt I/O If the user sends a fatal signal, sleeping in the TASK_KILLABLE state permits the task to be aborted. The only wrinkle is making sure that if/when the command completes later that it doesn't upset anything. Handle this by setting the data pointer to 0, and checking the value isn't NULL in the sync completion path. Eventually, bios can be cancelled through this path too. Note that the cmdid isn't freed to prevent reuse. We should also abort the command in the future, but this is a good start. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 06a6aeaa827a..4bfed59f3629 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -155,7 +155,9 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, } /* If you need more than four handlers, you'll need to change how - * alloc_cmdid and nvme_process_cq work + * alloc_cmdid and nvme_process_cq work. Also, aborted commands take + * the sync_completion path (if they complete), so don't put anything + * else in slot zero. */ enum { sync_completion_id = 0, @@ -172,6 +174,11 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) return data; } +static void clear_cmdid_data(struct nvme_queue *nvmeq, int cmdid) +{ + nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)] = 0; +} + static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) { int qid, cpu = get_cpu(); @@ -386,6 +393,8 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct sync_cmd_info *cmdinfo = ctx; + if (!cmdinfo) + return; /* Command aborted */ cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; wake_up_process(cmdinfo->task); @@ -446,12 +455,19 @@ static irqreturn_t nvme_irq(int irq, void *data) return nvme_process_cq(data); } +static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) +{ + spin_lock_irq(&nvmeq->q_lock); + clear_cmdid_data(nvmeq, cmdid); + spin_unlock_irq(&nvmeq->q_lock); +} + /* * Returns 0 on success. 
If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd, - u32 *result) +static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd, u32 *result) { int cmdid; struct sync_cmd_info cmdinfo; @@ -459,15 +475,20 @@ static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd, cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid_killable(q, &cmdinfo, sync_completion_id); + cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion_id); if (cmdid < 0) return cmdid; cmd->common.command_id = cmdid; - set_current_state(TASK_UNINTERRUPTIBLE); - nvme_submit_cmd(q, cmd); + set_current_state(TASK_KILLABLE); + nvme_submit_cmd(nvmeq, cmd); schedule(); + if (cmdinfo.status == -EINTR) { + nvme_abort_command(nvmeq, cmdid); + return -EINTR; + } + if (result) *result = cmdinfo.result; From b1ad37efcafe396ac3944853589688dd0ec3c64e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 4 Feb 2011 16:14:30 -0500 Subject: [PATCH 028/105] NVMe: Call put_nvmeq() before calling nvme_submit_sync_cmd() We can't have preemption disabled when we call schedule(). Accept the possibility that we'll get preempted, and it'll cost us some cacheline bounces. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 4bfed59f3629..1c3cd6cc0ad9 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -842,8 +842,13 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) nvme_setup_prps(&c.common, sg, length); nvmeq = get_nvmeq(ns); - status = nvme_submit_sync_cmd(nvmeq, &c, &result); + /* Since nvme_submit_sync_cmd sleeps, we can't keep preemption + * disabled. We may be preempted at any point, and be rescheduled + * to a different CPU. That will cause cacheline bouncing, but no + * additional races since q_lock already protects against other CPUs. + */ put_nvmeq(nvmeq); + status = nvme_submit_sync_cmd(nvmeq, &c, &result); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); put_user(result, &uio->result); From 58ffacb545f76fc2c65d1fbfa5acf5184a2a09e6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 Feb 2011 07:28:06 -0500 Subject: [PATCH 029/105] NVMe: Add a module parameter to use a threaded interrupt We're currently calling bio_endio from hard interrupt context. This is not a good idea for preemptible kernels as it will cause longer latencies. Using a threaded interrupt will run the entire queue processing mechanism (including bio_endio) in a thread, which can be preempted. Unfortuantely, it also adds about 7us of latency to the single-I/O case, so make it a module parameter for the moment. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 1c3cd6cc0ad9..60c3786bc787 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -44,6 +44,9 @@ static int nvme_major; module_param(nvme_major, int, 0); +static int use_threaded_interrupts; +module_param(use_threaded_interrupts, int, 0); + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. 
*/ @@ -455,6 +458,25 @@ static irqreturn_t nvme_irq(int irq, void *data) return nvme_process_cq(data); } +static irqreturn_t nvme_irq_thread(int irq, void *data) +{ + irqreturn_t result; + struct nvme_queue *nvmeq = data; + spin_lock(&nvmeq->q_lock); + result = nvme_process_cq(nvmeq); + spin_unlock(&nvmeq->q_lock); + return result; +} + +static irqreturn_t nvme_irq_check(int irq, void *data) +{ + struct nvme_queue *nvmeq = data; + struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; + if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) + return IRQ_NONE; + return IRQ_WAKE_THREAD; +} + static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) { spin_lock_irq(&nvmeq->q_lock); @@ -630,6 +652,11 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, const char *name) { + if (use_threaded_interrupts) + return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, + nvme_irq_check, nvme_irq_thread, + IRQF_DISABLED | IRQF_SHARED, + name, nvmeq); return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, IRQF_DISABLED | IRQF_SHARED, name, nvmeq); } From be7b62754e097adc0cb16c25c9ee86ee20de62fb Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 Feb 2011 07:53:23 -0500 Subject: [PATCH 030/105] NVMe: Use a symbolic name to represent cancelled commands instead of 0 I have plans for other special values in sync_completion. Plus, this is more self-documenting, and lets us detect bogus usages. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 60c3786bc787..802d763d9d06 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -158,15 +159,17 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, } /* If you need more than four handlers, you'll need to change how - * alloc_cmdid and nvme_process_cq work. Also, aborted commands take - * the sync_completion path (if they complete), so don't put anything - * else in slot zero. + * alloc_cmdid and nvme_process_cq work. Consider using a special + * CMD_CTX value instead, if that works for your situation. 
*/ enum { sync_completion_id = 0, bio_completion_id, }; +#define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) +#define CMD_CTX_CANCELLED (0x2008 + CMD_CTX_BASE) + static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { unsigned long data; @@ -177,9 +180,10 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) return data; } -static void clear_cmdid_data(struct nvme_queue *nvmeq, int cmdid) +static void cancel_cmdid_data(struct nvme_queue *nvmeq, int cmdid) { - nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)] = 0; + nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)] = + CMD_CTX_CANCELLED; } static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) @@ -396,8 +400,8 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct sync_cmd_info *cmdinfo = ctx; - if (!cmdinfo) - return; /* Command aborted */ + if ((unsigned long)cmdinfo == CMD_CTX_CANCELLED) + return; cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; wake_up_process(cmdinfo->task); @@ -480,7 +484,7 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) { spin_lock_irq(&nvmeq->q_lock); - clear_cmdid_data(nvmeq, cmdid); + cancel_cmdid_data(nvmeq, cmdid); spin_unlock_irq(&nvmeq->q_lock); } From b36235df01ec4141b4e589571d6789076c346d88 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 Feb 2011 08:49:55 -0500 Subject: [PATCH 031/105] NVMe: Detect commands that are completed twice Set the context value to CMD_CTX_COMPLETED, and print a message in the sync_completion handler if we see it. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 802d763d9d06..2dd09e7e142d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -169,12 +169,15 @@ enum { #define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) #define CMD_CTX_CANCELLED (0x2008 + CMD_CTX_BASE) +#define CMD_CTX_COMPLETED (0x2010 + CMD_CTX_BASE) static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { unsigned long data; + unsigned offset = cmdid + BITS_TO_LONGS(nvmeq->q_depth); - data = nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)]; + data = nvmeq->cmdid_data[offset]; + nvmeq->cmdid_data[offset] = CMD_CTX_COMPLETED; clear_bit(cmdid, nvmeq->cmdid_data); wake_up(&nvmeq->sq_full); return data; @@ -182,8 +185,8 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) static void cancel_cmdid_data(struct nvme_queue *nvmeq, int cmdid) { - nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)] = - CMD_CTX_CANCELLED; + unsigned offset = cmdid + BITS_TO_LONGS(nvmeq->q_depth); + nvmeq->cmdid_data[offset] = CMD_CTX_CANCELLED; } static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) @@ -402,6 +405,12 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct sync_cmd_info *cmdinfo = ctx; if ((unsigned long)cmdinfo == CMD_CTX_CANCELLED) return; + if (unlikely((unsigned long)cmdinfo == CMD_CTX_COMPLETED)) { + dev_warn(nvmeq->q_dmadev, + "completed id %d twice on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; wake_up_process(cmdinfo->task); From 48e3d39816416b3bf03dee3a796c0c04427c1a31 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 
Feb 2011 08:51:15 -0500 Subject: [PATCH 032/105] NVMe: Detect command IDs completing that are out of range If the adapter completes a command ID that is outside the bounds of the array, return CMD_CTX_INVALID instead of random data, and print a message in the sync_completion handler (which is rapidly becoming the misc completion handler :-) Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 2dd09e7e142d..f4085d4fe0f2 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -170,12 +170,15 @@ enum { #define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) #define CMD_CTX_CANCELLED (0x2008 + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x2010 + CMD_CTX_BASE) +#define CMD_CTX_INVALID (0x2014 + CMD_CTX_BASE) static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { unsigned long data; unsigned offset = cmdid + BITS_TO_LONGS(nvmeq->q_depth); + if (cmdid > nvmeq->q_depth) + return CMD_CTX_INVALID; data = nvmeq->cmdid_data[offset]; nvmeq->cmdid_data[offset] = CMD_CTX_COMPLETED; clear_bit(cmdid, nvmeq->cmdid_data); @@ -411,6 +414,12 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } + if (unlikely((unsigned long)cmdinfo == CMD_CTX_INVALID)) { + dev_warn(nvmeq->q_dmadev, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; wake_up_process(cmdinfo->task); From ec6ce618d65b5ce1bef83a5509255107a0feac44 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 Feb 2011 09:01:00 -0500 Subject: [PATCH 033/105] NVMe: Need to lock queue during interrupt handling If we're sharing a queue between multiple CPUs and we cancel a sync I/O, we must have the queue locked to avoid corrupting the stack of the thread that submitted the I/O. It turns out this is the same locking that's needed for the threaded irq handler, so share that code. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f4085d4fe0f2..139e6fc1e2a8 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -476,11 +476,6 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) } static irqreturn_t nvme_irq(int irq, void *data) -{ - return nvme_process_cq(data); -} - -static irqreturn_t nvme_irq_thread(int irq, void *data) { irqreturn_t result; struct nvme_queue *nvmeq = data; @@ -676,7 +671,7 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, { if (use_threaded_interrupts) return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, - nvme_irq_check, nvme_irq_thread, + nvme_irq_check, nvme_irq, IRQF_DISABLED | IRQF_SHARED, name, nvmeq); return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, From e85248e516c550382ba33ca325c272a0ca397e44 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 6 Feb 2011 18:30:16 -0500 Subject: [PATCH 034/105] NVMe: Record the timeout for each command In addition to recording the completion data for each command, record the anticipated completion time. Choose a timeout of 5 seconds for normal I/Os and 60 seconds for admin I/Os. 
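Nothing acts on the recorded deadline yet; storing it alongside the context is what makes a later timeout sweep possible. Such a sweep (purely hypothetical at this point, and assuming it runs under q_lock) might walk the outstanding command IDs along these lines:

    struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
    int cmdid;

    for_each_set_bit(cmdid, nvmeq->cmdid_data, nvmeq->q_depth) {
        if (time_after(jiffies, info[cmdid].timeout)) {
            /* the command has exceeded its deadline: cancel or abort it */
        }
    }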
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 49 +++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 139e6fc1e2a8..60c1048dc8bc 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -41,6 +41,8 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define NVME_MINORS 64 +#define IO_TIMEOUT (5 * HZ) +#define ADMIN_TIMEOUT (60 * HZ) static int nvme_major; module_param(nvme_major, int, 0); @@ -119,6 +121,16 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); } +struct nvme_cmd_info { + unsigned long ctx; + unsigned long timeout; +}; + +static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) +{ + return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; +} + /** * alloc_cmdid - Allocate a Command ID * @param nvmeq The queue that will be used for this command @@ -131,10 +143,11 @@ static inline void _nvme_check_size(void) * Passing in a pointer that's not 4-byte aligned will cause a BUG. * We can change this if it becomes a problem. */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler) +static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler, + unsigned timeout) { int depth = nvmeq->q_depth; - unsigned long data = (unsigned long)ctx | handler; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); int cmdid; BUG_ON((unsigned long)ctx & 3); @@ -145,16 +158,17 @@ static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler) return -EBUSY; } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); - nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(depth)] = data; + info[cmdid].ctx = (unsigned long)ctx | handler; + info[cmdid].timeout = jiffies + timeout; return cmdid; } static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - int handler) + int handler, unsigned timeout) { int cmdid; wait_event_killable(nvmeq->sq_full, - (cmdid = alloc_cmdid(nvmeq, ctx, handler)) >= 0); + (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); return (cmdid < 0) ? 
-EINTR : cmdid; } @@ -175,12 +189,12 @@ enum { static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { unsigned long data; - unsigned offset = cmdid + BITS_TO_LONGS(nvmeq->q_depth); + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid > nvmeq->q_depth) + if (cmdid >= nvmeq->q_depth) return CMD_CTX_INVALID; - data = nvmeq->cmdid_data[offset]; - nvmeq->cmdid_data[offset] = CMD_CTX_COMPLETED; + data = info[cmdid].ctx; + info[cmdid].ctx = CMD_CTX_COMPLETED; clear_bit(cmdid, nvmeq->cmdid_data); wake_up(&nvmeq->sq_full); return data; @@ -188,8 +202,8 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) static void cancel_cmdid_data(struct nvme_queue *nvmeq, int cmdid) { - unsigned offset = cmdid + BITS_TO_LONGS(nvmeq->q_depth); - nvmeq->cmdid_data[offset] = CMD_CTX_CANCELLED; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + info[cmdid].ctx = CMD_CTX_CANCELLED; } static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) @@ -327,7 +341,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, goto congestion; info->bio = bio; - cmdid = alloc_cmdid(nvmeq, info, bio_completion_id); + cmdid = alloc_cmdid(nvmeq, info, bio_completion_id, IO_TIMEOUT); if (unlikely(cmdid < 0)) goto free_info; @@ -506,7 +520,7 @@ static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) * if the result is positive, it's an NVM Express status code */ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, - struct nvme_command *cmd, u32 *result) + struct nvme_command *cmd, u32 *result, unsigned timeout) { int cmdid; struct sync_cmd_info cmdinfo; @@ -514,7 +528,8 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion_id); + cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion_id, + timeout); if (cmdid < 0) return cmdid; cmd->common.command_id = cmdid; @@ -537,7 +552,7 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev->queues[0], cmd, result); + return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -630,7 +645,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth, int vector) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = (depth + BITS_TO_LONGS(depth)) * sizeof(long); + unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info)); struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); if (!nvmeq) return NULL; @@ -892,7 +907,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) * additional races since q_lock already protects against other CPUs. */ put_nvmeq(nvmeq); - status = nvme_submit_sync_cmd(nvmeq, &c, &result); + status = nvme_submit_sync_cmd(nvmeq, &c, &result, IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); put_user(result, &uio->result); From 9294bbed78926a895516ec016ba23033f58d1a88 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 7 Feb 2011 12:45:24 -0500 Subject: [PATCH 035/105] NVMe: Handle the congestion list a little better In the bio completion handler, check for bios on the congestion list for this NVM queue. 
Also, lock the congestion list in the make_request function as the queue may end up being shared between multiple CPUs. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 60c1048dc8bc..2a0dd5e60347 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -105,6 +105,8 @@ struct nvme_queue { unsigned long cmdid_data[]; }; +static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio); + /* * Check we didin't inadvertently grow the command struct */ @@ -274,6 +276,9 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); free_info(info); bio_endio(bio, status ? -EIO : 0); + bio = bio_list_pop(&nvmeq->sq_cong); + if (bio) + nvme_resubmit_bio(nvmeq, bio); } /* length is in bytes */ @@ -392,6 +397,16 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return -EBUSY; } +static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio) +{ + struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + if (nvme_submit_bio_queue(nvmeq, ns, bio)) + bio_list_add_head(&nvmeq->sq_cong, bio); + else if (bio_list_empty(&nvmeq->sq_cong)) + blk_clear_queue_congested(ns->queue, rw_is_sync(bio->bi_rw)); + /* XXX: Need to duplicate the logic from __freed_request here */ +} + /* * NB: return value of non-zero would mean that we were a stacking driver. * make_request must always succeed. @@ -403,7 +418,9 @@ static int nvme_make_request(struct request_queue *q, struct bio *bio) if (nvme_submit_bio_queue(nvmeq, ns, bio)) { blk_set_queue_congested(q, rw_is_sync(bio->bi_rw)); + spin_lock_irq(&nvmeq->q_lock); bio_list_add(&nvmeq->sq_cong, bio); + spin_unlock_irq(&nvmeq->q_lock); } put_nvmeq(nvmeq); From d2d8703481f60d67f49e3177196cbe474b11377c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 7 Feb 2011 15:55:59 -0500 Subject: [PATCH 036/105] NVMe: Renumber the special context values If POISON_POINTER_DELTA isn't defined, ensure they're in page 0 which should never be mapped. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 2a0dd5e60347..71bdf6f2c93b 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -184,9 +184,9 @@ enum { }; #define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) -#define CMD_CTX_CANCELLED (0x2008 + CMD_CTX_BASE) -#define CMD_CTX_COMPLETED (0x2010 + CMD_CTX_BASE) -#define CMD_CTX_INVALID (0x2014 + CMD_CTX_BASE) +#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) +#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) +#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { From 51882d00f07da9601cc962a3596e48aafb4f4163 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 08:49:59 -0500 Subject: [PATCH 037/105] NVMe: Advance the sg pointer when filling in an sg list For multipage BIOs, we were always using sg[0] instead of advancing through the list. 
Oops :-) Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 71bdf6f2c93b..903e7f15b60d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -321,6 +321,7 @@ static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, sg_init_table(sg, psegs); bio_for_each_segment(bvec, bio, i) { sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); + sg++; /* XXX: handle non-mergable here */ nsegs++; } From e025344c56e08b155f43ea09647969286c78377c Mon Sep 17 00:00:00 2001 From: Shane Michael Matthews Date: Thu, 10 Feb 2011 08:51:24 -0500 Subject: [PATCH 038/105] NVMe: Initial PRP List support Add a pointer to the nvme_req_info to hold a new data structure (nvme_prps) which contains a list of the pages allocated to this particular request for holding PRP list entries. nvme_setup_prps() now returns this pointer. To allocate and free the memory used for PRP lists, we need a struct device, so we need to pass the nvme_queue pointer to many functions which didn't use to need it. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 106 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 92 insertions(+), 14 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 903e7f15b60d..b1e8445985a2 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -247,21 +247,55 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } +static __le64 *alloc_prp_list(struct nvme_queue *nvmeq, int length, + dma_addr_t *addr) +{ + return dma_alloc_coherent(nvmeq->q_dmadev, PAGE_SIZE, addr, GFP_ATOMIC); +} + +struct nvme_prps { + int npages; + dma_addr_t first_dma; + __le64 *list[0]; +}; + +static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) +{ + const int last_prp = PAGE_SIZE / 8 - 1; + int i; + dma_addr_t prp_dma; + + if (!prps) + return; + + prp_dma = prps->first_dma; + for (i = 0; i < prps->npages; i++) { + __le64 *prp_list = prps->list[i]; + dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); + dma_free_coherent(nvmeq->q_dmadev, PAGE_SIZE, prp_list, + prp_dma); + prp_dma = next_prp_dma; + } + kfree(prps); +} + struct nvme_req_info { struct bio *bio; int nents; + struct nvme_prps *prps; struct scatterlist sg[0]; }; /* XXX: use a mempool */ static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp) { - return kmalloc(sizeof(struct nvme_req_info) + + return kzalloc(sizeof(struct nvme_req_info) + sizeof(struct scatterlist) * nseg, gfp); } -static void free_info(struct nvme_req_info *info) +static void free_info(struct nvme_queue *nvmeq, struct nvme_req_info *info) { + nvme_free_prps(nvmeq, info->prps); kfree(info); } @@ -274,7 +308,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - free_info(info); + free_info(nvmeq, info); bio_endio(bio, status ? 
-EIO : 0); bio = bio_list_pop(&nvmeq->sq_cong); if (bio) @@ -282,17 +316,22 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, } /* length is in bytes */ -static void nvme_setup_prps(struct nvme_common_command *cmd, +static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, + struct nvme_common_command *cmd, struct scatterlist *sg, int length) { int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); int offset = offset_in_page(dma_addr); + __le64 *prp_list; + dma_addr_t prp_dma; + int nprps, npages, i, prp_page; + struct nvme_prps *prps = NULL; cmd->prp1 = cpu_to_le64(dma_addr); length -= (PAGE_SIZE - offset); if (length <= 0) - return; + return prps; dma_len -= (PAGE_SIZE - offset); if (dma_len) { @@ -305,10 +344,42 @@ static void nvme_setup_prps(struct nvme_common_command *cmd, if (length <= PAGE_SIZE) { cmd->prp2 = cpu_to_le64(dma_addr); - return; + return prps; } - /* XXX: support PRP lists */ + nprps = DIV_ROUND_UP(length, PAGE_SIZE); + npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE); + prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, GFP_ATOMIC); + prps->npages = npages; + prp_page = 0; + prp_list = alloc_prp_list(nvmeq, length, &prp_dma); + prps->list[prp_page++] = prp_list; + prps->first_dma = prp_dma; + cmd->prp2 = cpu_to_le64(prp_dma); + i = 0; + for (;;) { + if (i == PAGE_SIZE / 8 - 1) { + __le64 *old_prp_list = prp_list; + prp_list = alloc_prp_list(nvmeq, length, &prp_dma); + prps->list[prp_page++] = prp_list; + old_prp_list[i] = cpu_to_le64(prp_dma); + i = 0; + } + prp_list[i++] = cpu_to_le64(dma_addr); + dma_len -= PAGE_SIZE; + dma_addr += PAGE_SIZE; + length -= PAGE_SIZE; + if (length <= 0) + break; + if (dma_len > 0) + continue; + BUG_ON(dma_len < 0); + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + return prps; } static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, @@ -378,7 +449,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - nvme_setup_prps(&cmnd->common, info->sg, bio->bi_size); + info->prps = nvme_setup_prps(nvmeq, &cmnd->common, info->sg, + bio->bi_size); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); @@ -393,7 +465,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; free_info: - free_info(info); + free_info(nvmeq, info); congestion: return -EBUSY; } @@ -852,13 +924,15 @@ static int nvme_submit_user_admin_command(struct nvme_dev *dev, { int err, nents; struct scatterlist *sg; + struct nvme_prps *prps; nents = nvme_map_user_pages(dev, 0, addr, length, &sg); if (nents < 0) return nents; - nvme_setup_prps(&cmd->common, sg, length); + prps = nvme_setup_prps(dev->queues[0], &cmd->common, sg, length); err = nvme_submit_admin_cmd(dev, cmd, NULL); nvme_unmap_user_pages(dev, 0, addr, length, sg, nents); + nvme_free_prps(dev->queues[0], prps); return err ? -EIO : 0; } @@ -896,6 +970,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) u32 result; int nents, status; struct scatterlist *sg; + struct nvme_prps *prps; if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; @@ -915,10 +990,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.reftag = cpu_to_le32(io.reftag); /* XXX: endian? 
*/ c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); - /* XXX: metadata */ - nvme_setup_prps(&c.common, sg, length); - nvmeq = get_nvmeq(ns); + /* XXX: metadata */ + prps = nvme_setup_prps(nvmeq, &c.common, sg, length); + /* Since nvme_submit_sync_cmd sleeps, we can't keep preemption * disabled. We may be preempted at any point, and be rescheduled * to a different CPU. That will cause cacheline bouncing, but no @@ -928,6 +1003,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) status = nvme_submit_sync_cmd(nvmeq, &c, &result, IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); + nvme_free_prps(nvmeq, prps); put_user(result, &uio->result); return status; } @@ -940,6 +1016,7 @@ static int nvme_download_firmware(struct nvme_ns *ns, struct nvme_command c; int nents, status; struct scatterlist *sg; + struct nvme_prps *prps; if (copy_from_user(&dlfw, udlfw, sizeof(dlfw))) return -EFAULT; @@ -954,10 +1031,11 @@ static int nvme_download_firmware(struct nvme_ns *ns, c.dlfw.opcode = nvme_admin_download_fw; c.dlfw.numd = cpu_to_le32(dlfw.length); c.dlfw.offset = cpu_to_le32(dlfw.offset); - nvme_setup_prps(&c.common, sg, dlfw.length * 4); + prps = nvme_setup_prps(dev->queues[0], &c.common, sg, dlfw.length * 4); status = nvme_submit_admin_cmd(dev, &c, NULL); nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents); + nvme_free_prps(dev->queues[0], prps); return status; } From d534df3c730af9073a9ddc076d9fd65cbdca22b3 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 09:03:06 -0500 Subject: [PATCH 039/105] NVMe: Rename nvme_req_info to nvme_bio There are too many things called 'info' in this driver. This data structure is auxiliary information for a struct bio, so call it nvme_bio, or nbio when used as a variable. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 48 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b1e8445985a2..11df0e90edad 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -279,7 +279,7 @@ static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) kfree(prps); } -struct nvme_req_info { +struct nvme_bio { struct bio *bio; int nents; struct nvme_prps *prps; @@ -287,28 +287,28 @@ struct nvme_req_info { }; /* XXX: use a mempool */ -static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp) +static struct nvme_bio *alloc_nbio(unsigned nseg, gfp_t gfp) { - return kzalloc(sizeof(struct nvme_req_info) + + return kzalloc(sizeof(struct nvme_bio) + sizeof(struct scatterlist) * nseg, gfp); } -static void free_info(struct nvme_queue *nvmeq, struct nvme_req_info *info) +static void free_nbio(struct nvme_queue *nvmeq, struct nvme_bio *nbio) { - nvme_free_prps(nvmeq, info->prps); - kfree(info); + nvme_free_prps(nvmeq, nbio->prps); + kfree(nbio); } static void bio_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { - struct nvme_req_info *info = ctx; - struct bio *bio = info->bio; + struct nvme_bio *nbio = ctx; + struct bio *bio = nbio->bio; u16 status = le16_to_cpup(&cqe->status) >> 1; - dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents, + dma_unmap_sg(nvmeq->q_dmadev, nbio->sg, nbio->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - free_info(nvmeq, info); + free_nbio(nvmeq, nbio); bio_endio(bio, status ? 
-EIO : 0); bio = bio_list_pop(&nvmeq->sq_cong); if (bio) @@ -382,11 +382,11 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, return prps; } -static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, +static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { struct bio_vec *bvec; - struct scatterlist *sg = info->sg; + struct scatterlist *sg = nbio->sg; int i, nsegs; sg_init_table(sg, psegs); @@ -396,16 +396,16 @@ static int nvme_map_bio(struct device *dev, struct nvme_req_info *info, /* XXX: handle non-mergable here */ nsegs++; } - info->nents = nsegs; + nbio->nents = nsegs; - return dma_map_sg(dev, info->sg, info->nents, dma_dir); + return dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir); } static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio) { struct nvme_command *cmnd; - struct nvme_req_info *info; + struct nvme_bio *nbio; enum dma_data_direction dma_dir; int cmdid; u16 control; @@ -413,14 +413,14 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, unsigned long flags; int psegs = bio_phys_segments(ns->queue, bio); - info = alloc_info(psegs, GFP_NOIO); - if (!info) + nbio = alloc_nbio(psegs, GFP_NOIO); + if (!nbio) goto congestion; - info->bio = bio; + nbio->bio = bio; - cmdid = alloc_cmdid(nvmeq, info, bio_completion_id, IO_TIMEOUT); + cmdid = alloc_cmdid(nvmeq, nbio, bio_completion_id, IO_TIMEOUT); if (unlikely(cmdid < 0)) - goto free_info; + goto free_nbio; control = 0; if (bio->bi_rw & REQ_FUA) @@ -444,12 +444,12 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs); + nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs); cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - info->prps = nvme_setup_prps(nvmeq, &cmnd->common, info->sg, + nbio->prps = nvme_setup_prps(nvmeq, &cmnd->common, nbio->sg, bio->bi_size); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); @@ -464,8 +464,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; - free_info: - free_info(nvmeq, info); + free_nbio: + free_nbio(nvmeq, nbio); congestion: return -EBUSY; } From 091b609258b8e01cc45b01a41ca5e496f674d989 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 09:56:01 -0500 Subject: [PATCH 040/105] NVMe: Switch to use DMA Pool API Calling dma_free_coherent from interrupt context causes warnings. Using the DMA pools delays freeing until pool destruction, so avoids the problem. 
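For reference, the pool lifecycle this switches to looks roughly like this (a condensed sketch of the dma_pool API usage, not a copy of the driver code):

    struct dma_pool *pool;
    dma_addr_t dma;
    __le64 *prp_list;

    /* one pool per device: fixed-size, PAGE_SIZE-aligned blocks */
    pool = dma_pool_create("prp list page", &pdev->dev, PAGE_SIZE, PAGE_SIZE, 0);

    /* usable from atomic context with GFP_ATOMIC */
    prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &dma);
    /* ... fill prp_list, hand dma to the device ... */

    /* returns the block to the pool rather than to the system */
    dma_pool_free(pool, prp_list, dma);

    /* memory goes back to the system when the pool is destroyed */
    dma_pool_destroy(pool);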
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 11df0e90edad..80fe6a7a8163 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -57,6 +57,7 @@ struct nvme_dev { struct nvme_queue **queues; u32 __iomem *dbs; struct pci_dev *pci_dev; + struct dma_pool *prp_page_pool; int instance; int queue_count; u32 ctrl_config; @@ -88,6 +89,7 @@ struct nvme_ns { */ struct nvme_queue { struct device *q_dmadev; + struct nvme_dev *dev; spinlock_t q_lock; struct nvme_command *sq_cmds; volatile struct nvme_completion *cqes; @@ -247,10 +249,9 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } -static __le64 *alloc_prp_list(struct nvme_queue *nvmeq, int length, - dma_addr_t *addr) +static __le64 *alloc_prp_list(struct nvme_dev *dev, dma_addr_t *addr) { - return dma_alloc_coherent(nvmeq->q_dmadev, PAGE_SIZE, addr, GFP_ATOMIC); + return dma_pool_alloc(dev->prp_page_pool, GFP_ATOMIC, addr); } struct nvme_prps { @@ -262,6 +263,7 @@ struct nvme_prps { static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) { const int last_prp = PAGE_SIZE / 8 - 1; + struct nvme_dev *dev = nvmeq->dev; int i; dma_addr_t prp_dma; @@ -272,8 +274,7 @@ static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) for (i = 0; i < prps->npages; i++) { __le64 *prp_list = prps->list[i]; dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); - dma_free_coherent(nvmeq->q_dmadev, PAGE_SIZE, prp_list, - prp_dma); + dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); prp_dma = next_prp_dma; } kfree(prps); @@ -320,6 +321,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, struct nvme_common_command *cmd, struct scatterlist *sg, int length) { + struct nvme_dev *dev = nvmeq->dev; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); int offset = offset_in_page(dma_addr); @@ -352,7 +354,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, GFP_ATOMIC); prps->npages = npages; prp_page = 0; - prp_list = alloc_prp_list(nvmeq, length, &prp_dma); + prp_list = alloc_prp_list(dev, &prp_dma); prps->list[prp_page++] = prp_list; prps->first_dma = prp_dma; cmd->prp2 = cpu_to_le64(prp_dma); @@ -360,7 +362,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, for (;;) { if (i == PAGE_SIZE / 8 - 1) { __le64 *old_prp_list = prp_list; - prp_list = alloc_prp_list(nvmeq, length, &prp_dma); + prp_list = alloc_prp_list(dev, &prp_dma); prps->list[prp_page++] = prp_list; old_prp_list[i] = cpu_to_le64(prp_dma); i = 0; @@ -752,6 +754,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, goto free_cqdma; nvmeq->q_dmadev = dmadev; + nvmeq->dev = dev; spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; @@ -1302,6 +1305,22 @@ static int nvme_dev_remove(struct nvme_dev *dev) return 0; } +static int nvme_setup_prp_pools(struct nvme_dev *dev) +{ + struct device *dmadev = &dev->pci_dev->dev; + dev->prp_page_pool = dma_pool_create("prp list page", dmadev, + PAGE_SIZE, PAGE_SIZE, 0); + if (!dev->prp_page_pool) + return -ENOMEM; + + return 0; +} + +static void nvme_release_prp_pools(struct nvme_dev *dev) +{ + dma_pool_destroy(dev->prp_page_pool); +} + /* XXX: Use an ida or something to let remove / add work correctly */ static void nvme_set_instance(struct 
nvme_dev *dev) { @@ -1346,6 +1365,10 @@ static int __devinit nvme_probe(struct pci_dev *pdev, nvme_set_instance(dev); dev->entry[0].vector = pdev->irq; + result = nvme_setup_prp_pools(dev); + if (result) + goto disable_msix; + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) { result = -ENOMEM; @@ -1369,6 +1392,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev, disable_msix: pci_disable_msix(pdev); nvme_release_instance(dev); + nvme_release_prp_pools(dev); disable: pci_disable_device(pdev); pci_release_regions(pdev); @@ -1386,6 +1410,7 @@ static void __devexit nvme_remove(struct pci_dev *pdev) pci_disable_msix(pdev); iounmap(dev->bar); nvme_release_instance(dev); + nvme_release_prp_pools(dev); pci_disable_device(pdev); pci_release_regions(pdev); kfree(dev->queues); From 99802a7aee2b3dd720e382c52b892cc6a8122b11 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 10:30:34 -0500 Subject: [PATCH 041/105] NVMe: Optimise memory usage for I/Os between 4k and 128k Add a second memory pool for smaller I/Os. We can pack 16 of these on a single page instead of using an entire page for each one. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 80fe6a7a8163..cd7aeba8310b 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -58,6 +58,7 @@ struct nvme_dev { u32 __iomem *dbs; struct pci_dev *pci_dev; struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; int instance; int queue_count; u32 ctrl_config; @@ -249,11 +250,6 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } -static __le64 *alloc_prp_list(struct nvme_dev *dev, dma_addr_t *addr) -{ - return dma_pool_alloc(dev->prp_page_pool, GFP_ATOMIC, addr); -} - struct nvme_prps { int npages; dma_addr_t first_dma; @@ -271,6 +267,9 @@ static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) return; prp_dma = prps->first_dma; + + if (prps->npages == 0) + dma_pool_free(dev->prp_small_pool, prps->list[0], prp_dma); for (i = 0; i < prps->npages; i++) { __le64 *prp_list = prps->list[i]; dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); @@ -322,6 +321,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, struct scatterlist *sg, int length) { struct nvme_dev *dev = nvmeq->dev; + struct dma_pool *pool; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); int offset = offset_in_page(dma_addr); @@ -352,9 +352,16 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, nprps = DIV_ROUND_UP(length, PAGE_SIZE); npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE); prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, GFP_ATOMIC); - prps->npages = npages; prp_page = 0; - prp_list = alloc_prp_list(dev, &prp_dma); + if (nprps <= (256 / 8)) { + pool = dev->prp_small_pool; + prps->npages = 0; + } else { + pool = dev->prp_page_pool; + prps->npages = npages; + } + + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); prps->list[prp_page++] = prp_list; prps->first_dma = prp_dma; cmd->prp2 = cpu_to_le64(prp_dma); @@ -362,7 +369,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, for (;;) { if (i == PAGE_SIZE / 8 - 1) { __le64 *old_prp_list = prp_list; - prp_list = alloc_prp_list(dev, &prp_dma); + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); prps->list[prp_page++] = prp_list; old_prp_list[i] = cpu_to_le64(prp_dma); i = 0; 
@@ -1313,12 +1320,20 @@ static int nvme_setup_prp_pools(struct nvme_dev *dev) if (!dev->prp_page_pool) return -ENOMEM; + /* Optimisation for I/Os between 4k and 128k */ + dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, + 256, 256, 0); + if (!dev->prp_small_pool) { + dma_pool_destroy(dev->prp_page_pool); + return -ENOMEM; + } return 0; } static void nvme_release_prp_pools(struct nvme_dev *dev) { dma_pool_destroy(dev->prp_page_pool); + dma_pool_destroy(dev->prp_small_pool); } /* XXX: Use an ida or something to let remove / add work correctly */ From d567760c409f981d35fc755b51d5bf56a99a467b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 10:47:55 -0500 Subject: [PATCH 042/105] NVMe: Pass the nvme_dev to nvme_free_prps and nvme_setup_prps We were passing the nvme_queue to access the q_dmadev for the dma_alloc_coherent calls, but since we moved to the dma pool API, we really only need the nvme_dev. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index cd7aeba8310b..2948043483fe 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -256,10 +256,9 @@ struct nvme_prps { __le64 *list[0]; }; -static void nvme_free_prps(struct nvme_queue *nvmeq, struct nvme_prps *prps) +static void nvme_free_prps(struct nvme_dev *dev, struct nvme_prps *prps) { const int last_prp = PAGE_SIZE / 8 - 1; - struct nvme_dev *dev = nvmeq->dev; int i; dma_addr_t prp_dma; @@ -295,7 +294,7 @@ static struct nvme_bio *alloc_nbio(unsigned nseg, gfp_t gfp) static void free_nbio(struct nvme_queue *nvmeq, struct nvme_bio *nbio) { - nvme_free_prps(nvmeq, nbio->prps); + nvme_free_prps(nvmeq->dev, nbio->prps); kfree(nbio); } @@ -316,11 +315,10 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, } /* length is in bytes */ -static struct nvme_prps *nvme_setup_prps(struct nvme_queue *nvmeq, +static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, struct scatterlist *sg, int length) { - struct nvme_dev *dev = nvmeq->dev; struct dma_pool *pool; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); @@ -458,7 +456,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - nbio->prps = nvme_setup_prps(nvmeq, &cmnd->common, nbio->sg, + nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg, bio->bi_size); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); @@ -939,10 +937,10 @@ static int nvme_submit_user_admin_command(struct nvme_dev *dev, nents = nvme_map_user_pages(dev, 0, addr, length, &sg); if (nents < 0) return nents; - prps = nvme_setup_prps(dev->queues[0], &cmd->common, sg, length); + prps = nvme_setup_prps(dev, &cmd->common, sg, length); err = nvme_submit_admin_cmd(dev, cmd, NULL); nvme_unmap_user_pages(dev, 0, addr, length, sg, nents); - nvme_free_prps(dev->queues[0], prps); + nvme_free_prps(dev, prps); return err ? -EIO : 0; } @@ -1000,10 +998,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.reftag = cpu_to_le32(io.reftag); /* XXX: endian? 
*/ c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); - nvmeq = get_nvmeq(ns); /* XXX: metadata */ - prps = nvme_setup_prps(nvmeq, &c.common, sg, length); + prps = nvme_setup_prps(dev, &c.common, sg, length); + nvmeq = get_nvmeq(ns); /* Since nvme_submit_sync_cmd sleeps, we can't keep preemption * disabled. We may be preempted at any point, and be rescheduled * to a different CPU. That will cause cacheline bouncing, but no @@ -1013,7 +1011,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) status = nvme_submit_sync_cmd(nvmeq, &c, &result, IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); - nvme_free_prps(nvmeq, prps); + nvme_free_prps(dev, prps); put_user(result, &uio->result); return status; } @@ -1041,11 +1039,11 @@ static int nvme_download_firmware(struct nvme_ns *ns, c.dlfw.opcode = nvme_admin_download_fw; c.dlfw.numd = cpu_to_le32(dlfw.length); c.dlfw.offset = cpu_to_le32(dlfw.offset); - prps = nvme_setup_prps(dev->queues[0], &c.common, sg, dlfw.length * 4); + prps = nvme_setup_prps(dev, &c.common, sg, dlfw.length * 4); status = nvme_submit_admin_cmd(dev, &c, NULL); nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents); - nvme_free_prps(dev->queues[0], prps); + nvme_free_prps(dev, prps); return status; } From 1974b1ae8852324a75fb8cfecbc7b758fd5a2c3c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 12:01:09 -0500 Subject: [PATCH 043/105] NVMe: Check for DMA mapping failure If dma_map_sg returns 0 (failure), we need to fail the I/O. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 2948043483fe..bfdca3a3a41a 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -451,7 +451,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs); + if (nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs) == 0) + goto mapping_failed; cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; @@ -471,6 +472,11 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; + mapping_failed: + free_nbio(nvmeq, nbio); + bio_endio(bio, -ENOMEM); + return 0; + free_nbio: free_nbio(nvmeq, nbio); congestion: From 768308400f5b4ce665a072eb976a851978b7706e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 10 Feb 2011 13:55:39 -0500 Subject: [PATCH 044/105] NVMe: Handle physical merging of bvec entries In order to not overrun the sg array, we have to merge physically contiguous pages into a single sg entry. 
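Here "physically mergeable" means two bio_vecs describe physically contiguous data, so they can share one scatterlist entry. A simplified, illustrative version of the test (the real BIOVEC_PHYS_MERGEABLE macro in the block layer handles additional cases):

	/* illustrative: vec2 can be merged into vec1's sg entry when it
	 * starts at exactly the physical address where vec1 ends */
	static bool example_phys_mergeable(const struct bio_vec *v1,
					   const struct bio_vec *v2)
	{
		phys_addr_t v1_end = page_to_phys(v1->bv_page) +
				     v1->bv_offset + v1->bv_len;
		phys_addr_t v2_start = page_to_phys(v2->bv_page) + v2->bv_offset;

		return v1_end == v2_start;
	}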
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index bfdca3a3a41a..c0e84b688f50 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -392,19 +392,25 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { - struct bio_vec *bvec; - struct scatterlist *sg = nbio->sg; - int i, nsegs; + struct bio_vec *bvec, *bvprv = NULL; + struct scatterlist *sg = NULL; + int i, nsegs = 0; - sg_init_table(sg, psegs); + sg_init_table(nbio->sg, psegs); bio_for_each_segment(bvec, bio, i) { - sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); - sg++; - /* XXX: handle non-mergable here */ - nsegs++; + if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { + sg->length += bvec->bv_len; + } else { + /* Check bvprv && offset == 0 */ + sg = sg ? sg + 1 : nbio->sg; + sg_set_page(sg, bvec->bv_page, bvec->bv_len, + bvec->bv_offset); + nsegs++; + } + bvprv = bvec; } nbio->nents = nsegs; - + sg_mark_end(sg); return dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir); } From 897cfe1ce7db152fa6dde576f4213a6160bf6502 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 14 Feb 2011 12:20:15 -0500 Subject: [PATCH 045/105] NVMe: Update BAR structure to match the current spec Add two reserved registers in the middle of the BAR to match the 1.0 spec plus ECN 0002. Also rename IMC and ISC to INTMC and INTSC to conform with the spec. We still don't need to use them :-) Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8eed0e432eef..757faa71666e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -24,10 +24,12 @@ struct nvme_bar { __u64 cap; /* Controller Capabilities */ __u32 vs; /* Version */ - __u32 ims; /* Interrupt Mask Set */ - __u32 imc; /* Interrupt Mask Clear */ + __u32 intms; /* Interrupt Mask Set */ + __u32 intmc; /* Interrupt Mask Clear */ __u32 cc; /* Controller Configuration */ + __u32 rsvd1; /* Reserved */ __u32 csts; /* Controller Status */ + __u32 rsvd2; /* Reserved */ __u32 aqa; /* Admin Queue Attributes */ __u64 asq; /* Admin SQ Base Address */ __u64 acq; /* Admin CQ Base Address */ From eeee322647a67c20d9277c5e02c42b2126ea74bc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 14 Feb 2011 15:55:33 -0500 Subject: [PATCH 046/105] NVMe: Handle failures differently in nvme_submit_bio_queue() Return -EBUSY if the queue is full or -ENOMEM if we failed to allocate memory (or map a scatterlist). Also use GFP_ATOMIC to allocate the nvme_bio and move the locking to the callers of nvme_submit_bio_queue(). In nvme_make_request(), don't permit an I/O to jump the queue -- if the congestion list already has an entry, just add to the tail, rather than trying to submit. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index c0e84b688f50..61a241741ca6 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -420,17 +420,17 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct nvme_command *cmnd; struct nvme_bio *nbio; enum dma_data_direction dma_dir; - int cmdid; + int cmdid, result = -ENOMEM; u16 control; u32 dsmgmt; - unsigned long flags; int psegs = bio_phys_segments(ns->queue, bio); - nbio = alloc_nbio(psegs, GFP_NOIO); + nbio = alloc_nbio(psegs, GFP_ATOMIC); if (!nbio) - goto congestion; + goto nomem; nbio->bio = bio; + result = -EBUSY; cmdid = alloc_cmdid(nvmeq, nbio, bio_completion_id, IO_TIMEOUT); if (unlikely(cmdid < 0)) goto free_nbio; @@ -445,7 +445,6 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (bio->bi_rw & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - spin_lock_irqsave(&nvmeq->q_lock, flags); cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; memset(cmnd, 0, sizeof(*cmnd)); @@ -457,8 +456,9 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } + result = -ENOMEM; if (nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs) == 0) - goto mapping_failed; + goto free_nbio; cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; @@ -474,19 +474,12 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; - spin_unlock_irqrestore(&nvmeq->q_lock, flags); - - return 0; - - mapping_failed: - free_nbio(nvmeq, nbio); - bio_endio(bio, -ENOMEM); return 0; free_nbio: free_nbio(nvmeq, nbio); - congestion: - return -EBUSY; + nomem: + return result; } static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio) @@ -507,13 +500,18 @@ static int nvme_make_request(struct request_queue *q, struct bio *bio) { struct nvme_ns *ns = q->queuedata; struct nvme_queue *nvmeq = get_nvmeq(ns); + int result = -EBUSY; - if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - blk_set_queue_congested(q, rw_is_sync(bio->bi_rw)); - spin_lock_irq(&nvmeq->q_lock); + spin_lock_irq(&nvmeq->q_lock); + if (bio_list_empty(&nvmeq->sq_cong)) + result = nvme_submit_bio_queue(nvmeq, ns, bio); + if (unlikely(result)) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); bio_list_add(&nvmeq->sq_cong, bio); - spin_unlock_irq(&nvmeq->q_lock); } + + spin_unlock_irq(&nvmeq->q_lock); put_nvmeq(nvmeq); return 0; From 1fa6aeadf18aeebd7a217d7a3a933856448375b6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 2 Mar 2011 18:37:18 -0500 Subject: [PATCH 047/105] NVMe: Add a kthread to handle the congestion list Instead of trying to resubmit I/Os in the I/O completion path (in interrupt context), wake up a kthread which will resubmit I/O from user context. This allows mke2fs to run to completion. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 86 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 19 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 61a241741ca6..606371e62905 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -50,10 +51,15 @@ module_param(nvme_major, int, 0); static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); +static DEFINE_SPINLOCK(dev_list_lock); +static LIST_HEAD(dev_list); +static struct task_struct *nvme_thread; + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. */ struct nvme_dev { + struct list_head node; struct nvme_queue **queues; u32 __iomem *dbs; struct pci_dev *pci_dev; @@ -97,6 +103,7 @@ struct nvme_queue { dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; wait_queue_head_t sq_full; + wait_queue_t sq_cong_wait; struct bio_list sq_cong; u32 __iomem *q_db; u16 q_depth; @@ -108,8 +115,6 @@ struct nvme_queue { unsigned long cmdid_data[]; }; -static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio); - /* * Check we didin't inadvertently grow the command struct */ @@ -309,9 +314,6 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); free_nbio(nvmeq, nbio); bio_endio(bio, status ? -EIO : 0); - bio = bio_list_pop(&nvmeq->sq_cong); - if (bio) - nvme_resubmit_bio(nvmeq, bio); } /* length is in bytes */ @@ -482,16 +484,6 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return result; } -static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio) -{ - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - if (nvme_submit_bio_queue(nvmeq, ns, bio)) - bio_list_add_head(&nvmeq->sq_cong, bio); - else if (bio_list_empty(&nvmeq->sq_cong)) - blk_clear_queue_congested(ns->queue, rw_is_sync(bio->bi_rw)); - /* XXX: Need to duplicate the logic from __freed_request here */ -} - /* * NB: return value of non-zero would mean that we were a stacking driver. * make_request must always succeed. 
@@ -774,6 +766,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->cq_head = 0; nvmeq->cq_phase = 1; init_waitqueue_head(&nvmeq->sq_full); + init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); bio_list_init(&nvmeq->sq_cong); nvmeq->q_db = &dev->dbs[qid * 2]; nvmeq->q_depth = depth; @@ -1097,6 +1090,43 @@ static const struct block_device_operations nvme_fops = { .ioctl = nvme_ioctl, }; +static void nvme_resubmit_bios(struct nvme_queue *nvmeq) +{ + while (bio_list_peek(&nvmeq->sq_cong)) { + struct bio *bio = bio_list_pop(&nvmeq->sq_cong); + struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + if (nvme_submit_bio_queue(nvmeq, ns, bio)) { + bio_list_add_head(&nvmeq->sq_cong, bio); + break; + } + } +} + +static int nvme_kthread(void *data) +{ + struct nvme_dev *dev; + + while (!kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) { + int i; + for (i = 0; i < dev->queue_count; i++) { + struct nvme_queue *nvmeq = dev->queues[i]; + spin_lock_irq(&nvmeq->q_lock); + if (nvme_process_cq(nvmeq)) + printk("process_cq did something\n"); + nvme_resubmit_bios(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + } + } + spin_unlock(&dev_list_lock); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } + return 0; +} + static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, struct nvme_id_ns *id, struct nvme_lba_range_type *rt) { @@ -1307,6 +1337,10 @@ static int nvme_dev_remove(struct nvme_dev *dev) { struct nvme_ns *ns, *next; + spin_lock(&dev_list_lock); + list_del(&dev->node); + spin_unlock(&dev_list_lock); + /* TODO: wait all I/O finished or cancel them */ list_for_each_entry_safe(ns, next, &dev->namespaces, list) { @@ -1406,6 +1440,11 @@ static int __devinit nvme_probe(struct pci_dev *pdev, result = nvme_dev_add(dev); if (result) goto delete; + + spin_lock(&dev_list_lock); + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); + return 0; delete: @@ -1479,17 +1518,25 @@ static struct pci_driver nvme_driver = { static int __init nvme_init(void) { - int result; + int result = -EBUSY; + + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + if (IS_ERR(nvme_thread)) + return PTR_ERR(nvme_thread); nvme_major = register_blkdev(nvme_major, "nvme"); if (nvme_major <= 0) - return -EBUSY; + goto kill_kthread; result = pci_register_driver(&nvme_driver); - if (!result) - return 0; + if (result) + goto unregister_blkdev; + return 0; + unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); + kill_kthread: + kthread_stop(nvme_thread); return result; } @@ -1497,6 +1544,7 @@ static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); unregister_blkdev(nvme_major, "nvme"); + kthread_stop(nvme_thread); } MODULE_AUTHOR("Matthew Wilcox "); From ad8a5df97cb060aa4d817af25587c99e2d2fda97 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 14 Feb 2011 17:35:00 -0500 Subject: [PATCH 048/105] NVMe: Release 0.3 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 606371e62905..7554625fb94d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1549,6 +1549,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.2"); +MODULE_VERSION("0.3"); module_init(nvme_init); module_exit(nvme_exit); From ca1615424c9adfdbe7d484771d7a7c5ecc4bb6d2 Mon Sep 17 00:00:00 2001 From: 
Matthew Wilcox Date: Tue, 15 Feb 2011 13:44:13 -0500 Subject: [PATCH 049/105] NVMe: Remove setting of 'flags' in rw command This was the data transfer bit until spec rev 0.92 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 7554625fb94d..37cdf0711954 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -462,7 +462,6 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs) == 0) goto free_nbio; - cmnd->rw.flags = 1; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg, From b348b7d54368c87811907a8e88f0d96713c43009 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 15 Feb 2011 16:16:02 -0500 Subject: [PATCH 050/105] NVMe: Rename nr_queues to nr_io_queues I got confused about whether this included the admin queue or not, and had to resort to reading the spec. It doesn't include the admin queue, so make that clear in the name. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 37cdf0711954..f3aa8097e675 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1202,29 +1202,30 @@ static int set_queue_count(struct nvme_dev *dev, int count) static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) { - int result, cpu, i, nr_queues; + int result, cpu, i, nr_io_queues; - nr_queues = num_online_cpus(); - result = set_queue_count(dev, nr_queues); + nr_io_queues = num_online_cpus(); + result = set_queue_count(dev, nr_io_queues); if (result < 0) return result; - if (result < nr_queues) - nr_queues = result; + if (result < nr_io_queues) + nr_io_queues = result; /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, dev->queues[0]); - for (i = 0; i < nr_queues; i++) + for (i = 0; i < nr_io_queues; i++) dev->entry[i].entry = i; for (;;) { - result = pci_enable_msix(dev->pci_dev, dev->entry, nr_queues); + result = pci_enable_msix(dev->pci_dev, dev->entry, + nr_io_queues); if (result == 0) { break; } else if (result > 0) { - nr_queues = result; + nr_io_queues = result; continue; } else { - nr_queues = 1; + nr_io_queues = 1; break; } } @@ -1233,12 +1234,12 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) /* XXX: handle failure here */ cpu = cpumask_first(cpu_online_mask); - for (i = 0; i < nr_queues; i++) { + for (i = 0; i < nr_io_queues; i++) { irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); cpu = cpumask_next(cpu, cpu_online_mask); } - for (i = 0; i < nr_queues; i++) { + for (i = 0; i < nr_io_queues; i++) { dev->queues[i + 1] = nvme_create_queue(dev, i + 1, NVME_Q_DEPTH, i); if (!dev->queues[i + 1]) From 740216fc59cba54f65187c9ed92f29bce3cf8778 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 15 Feb 2011 16:28:20 -0500 Subject: [PATCH 051/105] NVMe: Let the kthread take care of devices earlier If interrupts are misconfigured, the kthread will be needed to process admin queue completions. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f3aa8097e675..df1d8bda8c7c 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1112,6 +1112,8 @@ static int nvme_kthread(void *data) int i; for (i = 0; i < dev->queue_count; i++) { struct nvme_queue *nvmeq = dev->queues[i]; + if (!nvmeq) + continue; spin_lock_irq(&nvmeq->q_lock); if (nvme_process_cq(nvmeq)) printk("process_cq did something\n"); @@ -1437,17 +1439,21 @@ static int __devinit nvme_probe(struct pci_dev *pdev, goto unmap; dev->queue_count++; - result = nvme_dev_add(dev); - if (result) - goto delete; - spin_lock(&dev_list_lock); list_add(&dev->node, &dev_list); spin_unlock(&dev_list_lock); + result = nvme_dev_add(dev); + if (result) + goto delete; + return 0; delete: + spin_lock(&dev_list_lock); + list_del(&dev->node); + spin_unlock(&dev_list_lock); + nvme_free_queues(dev); unmap: iounmap(dev->bar); From 7547881d0951384f9833ec3a80fac8f3f16f3b98 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Feb 2011 09:59:59 -0500 Subject: [PATCH 052/105] NVMe: Correct SQ doorbell semantics The value written to the doorbell needs to be the first free index in the queue, not the most recently used index in the queue. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index df1d8bda8c7c..af45e286d5dd 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -246,9 +246,9 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) spin_lock_irqsave(&nvmeq->q_lock, flags); tail = nvmeq->sq_tail; memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); - writel(tail, nvmeq->q_db); if (++tail == nvmeq->q_depth) tail = 0; + writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; spin_unlock_irqrestore(&nvmeq->q_lock, flags); @@ -471,9 +471,9 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); - writel(nvmeq->sq_tail, nvmeq->q_db); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); return 0; From c42705592be2a539f3027b6f3907de8e8f9591a8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 22 Feb 2011 14:15:34 -0500 Subject: [PATCH 053/105] NVMe: Mark CMD_CTX_CANCELLED as being unlikely Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index af45e286d5dd..ce919b49b30d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -518,7 +518,7 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct sync_cmd_info *cmdinfo = ctx; - if ((unsigned long)cmdinfo == CMD_CTX_CANCELLED) + if (unlikely((unsigned long)cmdinfo == CMD_CTX_CANCELLED)) return; if (unlikely((unsigned long)cmdinfo == CMD_CTX_COMPLETED)) { dev_warn(nvmeq->q_dmadev, From 00df5cb4eb927078850086f8becc3286a69ea12e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 22 Feb 2011 14:18:30 -0500 Subject: [PATCH 054/105] NVMe: Implement Flush Linux implements Flush as a bit in the bio. That means there may also be data associated with the flush; if so the flush should be sent before the data. 
To avoid completing the bio twice, I add CMD_CTX_FLUSH to indicate the completion routine should do nothing. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index ce919b49b30d..d99b400ccd79 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -191,10 +191,12 @@ enum { bio_completion_id, }; +/* Special values must be a multiple of 4, and less than 0x1000 */ #define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) +#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { @@ -416,6 +418,33 @@ static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, return dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir); } +static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, + int cmdid) +{ + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.command_id = cmdid; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + +static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) +{ + int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, + sync_completion_id, IO_TIMEOUT); + if (unlikely(cmdid < 0)) + return cmdid; + + return nvme_submit_flush(nvmeq, ns, cmdid); +} + static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio) { @@ -427,6 +456,12 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, u32 dsmgmt; int psegs = bio_phys_segments(ns->queue, bio); + if ((bio->bi_rw & REQ_FLUSH) && psegs) { + result = nvme_submit_flush_data(nvmeq, ns); + if (result) + return result; + } + nbio = alloc_nbio(psegs, GFP_ATOMIC); if (!nbio) goto nomem; @@ -437,6 +472,9 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (unlikely(cmdid < 0)) goto free_nbio; + if ((bio->bi_rw & REQ_FLUSH) && !psegs) + return nvme_submit_flush(nvmeq, ns, cmdid); + control = 0; if (bio->bi_rw & REQ_FUA) control |= NVME_RW_FUA; @@ -520,6 +558,8 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, struct sync_cmd_info *cmdinfo = ctx; if (unlikely((unsigned long)cmdinfo == CMD_CTX_CANCELLED)) return; + if ((unsigned long)cmdinfo == CMD_CTX_FLUSH) + return; if (unlikely((unsigned long)cmdinfo == CMD_CTX_COMPLETED)) { dev_warn(nvmeq->q_dmadev, "completed id %d twice on queue %d\n", From 1ad2f8932a72bf375361727949ced2cb4e8cfcef Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 23 Feb 2011 15:20:00 -0500 Subject: [PATCH 055/105] NVMe: Handle bios that contain non-virtually contiguous addresses NVMe scatterlists must be virtually contiguous, like almost all I/Os. However, when the filesystem lays out files with a hole, it can be that adjacent LBAs map to non-adjacent virtual addresses. Handle this by submitting one NVMe command at a time for each virtually discontiguous range. 
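Put another way, a single command can only cover a run of bio_vecs in which every vector but the last ends on a page boundary and every vector but the first starts at offset 0. A sketch of that test, which the patch expresses inverted as BIOVEC_NOT_VIRT_MERGEABLE:

	/* illustrative: consecutive bio_vecs may stay in the same command
	 * only if prev ends on a page boundary and next starts at offset 0;
	 * otherwise the rest of the bio is resubmitted as a new command */
	static bool example_virt_mergeable(const struct bio_vec *prev,
					   const struct bio_vec *next)
	{
		return next->bv_offset == 0 &&
		       ((prev->bv_offset + prev->bv_len) % PAGE_SIZE) == 0;
	}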
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index d99b400ccd79..240922706a93 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -315,7 +315,14 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, dma_unmap_sg(nvmeq->q_dmadev, nbio->sg, nbio->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); free_nbio(nvmeq, nbio); - bio_endio(bio, status ? -EIO : 0); + if (status) + bio_endio(bio, -EIO); + if (bio->bi_vcnt > bio->bi_idx) { + bio_list_add(&nvmeq->sq_cong, bio); + wake_up_process(nvme_thread); + } else { + bio_endio(bio, 0); + } } /* length is in bytes */ @@ -393,29 +400,41 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, return prps; } +/* NVMe scatterlists require no holes in the virtual address */ +#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ + (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) + static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { struct bio_vec *bvec, *bvprv = NULL; struct scatterlist *sg = NULL; - int i, nsegs = 0; + int i, old_idx, length = 0, nsegs = 0; sg_init_table(nbio->sg, psegs); + old_idx = bio->bi_idx; bio_for_each_segment(bvec, bio, i) { if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { sg->length += bvec->bv_len; } else { - /* Check bvprv && offset == 0 */ + if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) + break; sg = sg ? sg + 1 : nbio->sg; sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); nsegs++; } + length += bvec->bv_len; bvprv = bvec; } + bio->bi_idx = i; nbio->nents = nsegs; sg_mark_end(sg); - return dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir); + if (dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir) == 0) { + bio->bi_idx = old_idx; + return -ENOMEM; + } + return length; } static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, @@ -451,7 +470,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct nvme_command *cmnd; struct nvme_bio *nbio; enum dma_data_direction dma_dir; - int cmdid, result = -ENOMEM; + int cmdid, length, result = -ENOMEM; u16 control; u32 dsmgmt; int psegs = bio_phys_segments(ns->queue, bio); @@ -496,16 +515,17 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - result = -ENOMEM; - if (nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs) == 0) + result = nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs); + if (result < 0) goto free_nbio; + length = result; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg, - bio->bi_size); + length); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); - cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1); + cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); From d8ee9d69f275769aaad40ef7c944565ff8d2d24f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 24 Feb 2011 08:46:00 -0500 Subject: [PATCH 056/105] NVMe: Fix discontiguous accesses When we submit subsequent portions of the I/O, we need to access the updated block, not start reading again from the original position. This was showing up as miscompares in the XFS randholes testcase. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 240922706a93..562d75a0fc50 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -529,6 +529,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + bio->bi_sector += length >> 9; + if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); From e6d15f79f997a98b3a69abbc462fc9041cc1a7b4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 24 Feb 2011 08:49:41 -0500 Subject: [PATCH 057/105] NVMe: Reduce maximum queue depth by 1 The spec says we're not allowed to completely fill the submission queue. Solve this by reducing the number of allocatable cmdids by 1. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 562d75a0fc50..45bfae1ebd50 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -156,7 +156,7 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler, unsigned timeout) { - int depth = nvmeq->q_depth; + int depth = nvmeq->q_depth - 1; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); int cmdid; From b57ab0fada358357571f0eb448cdf2f144785321 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 24 Feb 2011 16:20:14 -0500 Subject: [PATCH 058/105] NVMe: Version 0.4 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 45bfae1ebd50..a8549dff4691 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1617,6 +1617,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.3"); +MODULE_VERSION("0.4"); module_init(nvme_init); module_exit(nvme_exit); From 2ddc4f74d8adcf3e1cdec7f3e72d19b5c878597c Mon Sep 17 00:00:00 2001 From: Krzysztof Wierzbicki Date: Mon, 28 Feb 2011 08:27:13 +0100 Subject: [PATCH 059/105] NVMe: Update admin opcodes to match the 1.0RC spec Signed-off-by: Krzysztof Wierzbicki Signed-off-by: Matthew Wilcox Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 757faa71666e..c46a9b7988fb 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -199,19 +199,19 @@ enum { enum nvme_admin_opcode { nvme_admin_delete_sq = 0x00, nvme_admin_create_sq = 0x01, - nvme_admin_get_features = 0x02, + nvme_admin_get_log_page = 0x02, nvme_admin_delete_cq = 0x04, nvme_admin_create_cq = 0x05, nvme_admin_identify = 0x06, nvme_admin_abort_cmd = 0x08, nvme_admin_set_features = 0x09, - nvme_admin_get_log_page = 0x0a, + nvme_admin_get_features = 0x0a, nvme_admin_async_event = 0x0c, - nvme_admin_download_fw = 0x0d, - nvme_admin_security_recv = 0x0e, - nvme_admin_format_nvm = 0x10, - nvme_admin_security_send = 0x11, - nvme_admin_activate_fw = 0x14, + nvme_admin_activate_fw = 0x10, + nvme_admin_download_fw = 0x11, + nvme_admin_format_nvm = 0x80, + nvme_admin_security_send = 0x81, + nvme_admin_security_recv = 0x82, }; enum { From 714a7a22884b74862540bc84955274d86b2f6040 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:28:24 -0400 
Subject: [PATCH 060/105] NVMe: Convert comments to kernel-doc notation Reported-by: Randy Dunlap Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index a8549dff4691..e392919e0eac 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -142,10 +142,10 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) } /** - * alloc_cmdid - Allocate a Command ID - * @param nvmeq The queue that will be used for this command - * @param ctx A pointer that will be passed to the handler - * @param handler The ID of the handler to call + * alloc_cmdid() - Allocate a Command ID + * @nvmeq: The queue that will be used for this command + * @ctx: A pointer that will be passed to the handler + * @handler: The ID of the handler to call * * Allocate a Command ID for a queue. The data passed in will * be passed to the completion handler. This is implemented by using @@ -234,7 +234,7 @@ static void put_nvmeq(struct nvme_queue *nvmeq) } /** - * nvme_submit_cmd: Copy a command into a queue and ring the doorbell + * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send * From fa92282149842645931580225647238428374758 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:29:00 -0400 Subject: [PATCH 061/105] NVMe: Fix comment formatting Reported-by: Randy Dunlap Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index e392919e0eac..740a9c1b81aa 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -182,7 +182,8 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, return (cmdid < 0) ? -EINTR : cmdid; } -/* If you need more than four handlers, you'll need to change how +/* + * If you need more than four handlers, you'll need to change how * alloc_cmdid and nvme_process_cq work. Consider using a special * CMD_CTX value instead, if that works for your situation. */ @@ -1066,7 +1067,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) prps = nvme_setup_prps(dev, &c.common, sg, length); nvmeq = get_nvmeq(ns); - /* Since nvme_submit_sync_cmd sleeps, we can't keep preemption + /* + * Since nvme_submit_sync_cmd sleeps, we can't keep preemption * disabled. We may be preempted at any point, and be rescheduled * to a different CPU. That will cause cacheline bouncing, but no * additional races since q_lock already protects against other CPUs. From 19e899b2f9f89f4a290dd5c9c24d15987a18ab21 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:29:24 -0400 Subject: [PATCH 062/105] NVMe: Remove outdated comments The head can never overrun the tail since we won't allocate enough command IDs to let that happen. The status codes are in sync with the spec. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 - include/linux/nvme.h | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 740a9c1b81aa..d4f95eb51dc1 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -245,7 +245,6 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { unsigned long flags; u16 tail; - /* XXX: Need to check tail isn't going to overrun head */ spin_lock_irqsave(&nvmeq->q_lock, flags); tail = nvmeq->sq_tail; memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c46a9b7988fb..6b5a8d19daf5 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -323,7 +323,6 @@ struct nvme_command { }; }; -/* XXX: Sync with spec */ enum { NVME_SC_SUCCESS = 0x0, NVME_SC_INVALID_OPCODE = 0x1, From ac88c36a385b848cb9efcb877fdfc4153a60bcab Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:29:58 -0400 Subject: [PATCH 063/105] NVMe: Fix interpretation of 'Number of Namespaces' field The spec says this is a 0s based value. We don't need to handle the maximal value because it's reserved to mean "every namespace". Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index d4f95eb51dc1..0d5c918b7d59 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1364,7 +1364,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) crt.features.prp1 = cpu_to_le64(dma_addr + 4096); crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); - for (i = 0; i < nn; i++) { + for (i = 0; i <= nn; i++) { cid.identify.nsid = cpu_to_le32(i); res = nvme_submit_admin_cmd(dev, &cid, NULL); if (res) From 7523d834dd1573610078eb1ac0933f6490232f90 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:43:40 -0400 Subject: [PATCH 064/105] NVMe: Fix off-by-one when filling in PRP lists If the last element in the PRP list fits on the end of the page, there's no need to allocate an extra page to put that single element in. It can fit on the end of the page. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 0d5c918b7d59..cf89db8c41ee 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -376,12 +376,13 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, cmd->prp2 = cpu_to_le64(prp_dma); i = 0; for (;;) { - if (i == PAGE_SIZE / 8 - 1) { + if (i == PAGE_SIZE / 8) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); prps->list[prp_page++] = prp_list; - old_prp_list[i] = cpu_to_le64(prp_dma); - i = 0; + prp_list[0] = old_prp_list[i - 1]; + old_prp_list[i - 1] = cpu_to_le64(prp_dma); + i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); dma_len -= PAGE_SIZE; From 3cb967c03926edd2c414082f4cc0feb7b372edae Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:45:49 -0400 Subject: [PATCH 065/105] NVMe: Remove the kthread from the wait queue Once there are no more bios on the congestion list, we can stop waking up the nvme kthread every time a completion happens. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index cf89db8c41ee..8d3c0b79ac2b 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1162,6 +1162,9 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq) bio_list_add_head(&nvmeq->sq_cong, bio); break; } + if (bio_list_empty(&nvmeq->sq_cong)) + remove_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); } } From 9ecdc946212f7cd592986b2c519b470404caa6b8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 Mar 2011 16:52:19 -0400 Subject: [PATCH 066/105] NVMe: Simplify queue lookup Fill in all the num_possible_cpus() entries with duplicate pointers. This reduces the complexity of the frequently-called get_nvmeq(), as well as avoiding a bug in it when there are fewer queues than CPUs. Reported-by: Shane Michael Matthews Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 8d3c0b79ac2b..f94f1731478f 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -221,12 +221,7 @@ static void cancel_cmdid_data(struct nvme_queue *nvmeq, int cmdid) static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) { - int qid, cpu = get_cpu(); - if (cpu < ns->dev->queue_count) - qid = cpu + 1; - else - qid = (cpu % rounddown_pow_of_two(ns->dev->queue_count)) + 1; - return ns->dev->queues[qid]; + return ns->dev->queues[get_cpu() + 1]; } static void put_nvmeq(struct nvme_queue *nvmeq) @@ -1316,6 +1311,11 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) dev->queue_count++; } + for (; i < num_possible_cpus(); i++) { + int target = i % rounddown_pow_of_two(dev->queue_count - 1); + dev->queues[i + 1] = dev->queues[target + 1]; + } + return 0; } From 4948168280b269a514045766ddd872cfac5968e1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 19 Mar 2011 14:55:38 -0400 Subject: [PATCH 067/105] NVMe: Add compat_ioctl Make ioctls work for 32-bit applications on 64-bit kernels. The structures are defined to be the same for both 32- and 64-bit applications, so we can use the same handler for both. Reported-by: Arnd Bergmann Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f94f1731478f..d0b52622e261 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1146,6 +1146,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, + .compat_ioctl = nvme_ioctl, }; static void nvme_resubmit_bios(struct nvme_queue *nvmeq) From 9d4af1b7796ba02b73a79a8694399e5a3cd1c55d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 20 Mar 2011 07:27:10 -0400 Subject: [PATCH 068/105] NVMe: Correct the definitions of two ioctls NVME_IOCTL_SUBMIT_IO has a struct nvme_user_io, not a struct nvme_rw_command as a parameter, and NVME_IOCTL_DOWNLOAD_FW is a Write, not a Read. 
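The direction in these macros is from the application's point of view: _IOW marks an argument the application writes and the kernel reads, _IOR one the application reads back, and _IOWR both. An illustrative pair (the 'E' ioctl type and struct example_arg are invented for this example):

	#define EXAMPLE_SET_CFG		_IOW('E', 0x01, struct example_arg)	/* app -> kernel */
	#define EXAMPLE_GET_CFG		_IOR('E', 0x02, struct example_arg)	/* kernel -> app */
	#define EXAMPLE_XCHG_CFG	_IOWR('E', 0x03, struct example_arg)	/* both ways */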
Reported-by: Arnd Bergmann Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 6b5a8d19daf5..fd10d597cca7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -395,8 +395,8 @@ struct nvme_dlfw { #define NVME_IOCTL_IDENTIFY_NS _IOW('N', 0x40, struct nvme_id_ns) #define NVME_IOCTL_IDENTIFY_CTRL _IOW('N', 0x41, struct nvme_id_ctrl) #define NVME_IOCTL_GET_RANGE_TYPE _IOW('N', 0x42, struct nvme_lba_range_type) -#define NVME_IOCTL_SUBMIT_IO _IOWR('N', 0x43, struct nvme_rw_command) -#define NVME_IOCTL_DOWNLOAD_FW _IOR('N', 0x44, struct nvme_dlfw) +#define NVME_IOCTL_SUBMIT_IO _IOWR('N', 0x43, struct nvme_user_io) +#define NVME_IOCTL_DOWNLOAD_FW _IOW('N', 0x44, struct nvme_dlfw) #define NVME_IOCTL_ACTIVATE_FW _IO('N', 0x45) #endif /* _LINUX_NVME_H */ From 6c7d49455ceb63064f992347d9185ff5bf43497a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 21 Mar 2011 09:48:57 -0400 Subject: [PATCH 069/105] NVMe: Change the definition of nvme_user_io The read and write commands don't define a 'result', so there's no need to copy it back to userspace. Remove the ability of the ioctl to submit commands to a different namespace; it's just asking for trouble, and the use case I have in mind will be addressed througha different ioctl in the future. That removes the need for both the block_shift and nsid arguments. Check that the opcode is one of 'read' or 'write'. Future opcodes may be added in the future, but we will need a different structure definition for them. The nblocks field is redefined to be 0-based. This allows the user to request the full 65536 blocks. Don't byteswap the reftag, apptag and appmask. Martin Petersen tells me these are calculated in big-endian and are transmitted to the device in big-endian. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 27 +++++++++++++++++---------- include/linux/nvme.h | 8 +++----- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index d0b52622e261..90a96ec8a596 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1035,29 +1035,37 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) struct nvme_user_io io; struct nvme_command c; unsigned length; - u32 result; int nents, status; struct scatterlist *sg; struct nvme_prps *prps; if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; - length = io.nblocks << io.block_shift; - nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length, &sg); + length = (io.nblocks + 1) << ns->lba_shift; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, + length, &sg); + default: + return -EFAULT; + } + if (nents < 0) return nents; memset(&c, 0, sizeof(c)); c.rw.opcode = io.opcode; c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(io.nsid); + c.rw.nsid = cpu_to_le32(ns->ns_id); c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks - 1); + c.rw.length = cpu_to_le16(io.nblocks); c.rw.control = cpu_to_le16(io.control); c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); /* XXX: endian? 
*/ - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); + c.rw.reftag = io.reftag; + c.rw.apptag = io.apptag; + c.rw.appmask = io.appmask; /* XXX: metadata */ prps = nvme_setup_prps(dev, &c.common, sg, length); @@ -1069,11 +1077,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) * additional races since q_lock already protects against other CPUs. */ put_nvmeq(nvmeq); - status = nvme_submit_sync_cmd(nvmeq, &c, &result, IO_TIMEOUT); + status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); nvme_free_prps(dev, prps); - put_user(result, &uio->result); return status; } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fd10d597cca7..347ad5f9a721 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -373,17 +373,15 @@ struct nvme_user_io { __u8 opcode; __u8 flags; __u16 control; - __u32 nsid; + __u16 nblocks; + __u16 rsvd; __u64 metadata; __u64 addr; __u64 slba; - __u16 nblocks; - __u16 block_shift; __u32 dsmgmt; __u32 reftag; __u16 apptag; __u16 appmask; - __u32 result; }; struct nvme_dlfw { @@ -395,7 +393,7 @@ struct nvme_dlfw { #define NVME_IOCTL_IDENTIFY_NS _IOW('N', 0x40, struct nvme_id_ns) #define NVME_IOCTL_IDENTIFY_CTRL _IOW('N', 0x41, struct nvme_id_ctrl) #define NVME_IOCTL_GET_RANGE_TYPE _IOW('N', 0x42, struct nvme_lba_range_type) -#define NVME_IOCTL_SUBMIT_IO _IOWR('N', 0x43, struct nvme_user_io) +#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x43, struct nvme_user_io) #define NVME_IOCTL_DOWNLOAD_FW _IOW('N', 0x44, struct nvme_dlfw) #define NVME_IOCTL_ACTIVATE_FW _IO('N', 0x45) From 8ef700678f65e2eef1c3a94cdedb79d757608392 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 21 Mar 2011 10:28:43 -0400 Subject: [PATCH 070/105] NVMe: Version 0.5 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 90a96ec8a596..d3eeca5a3c4c 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1630,6 +1630,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.4"); +MODULE_VERSION("0.5"); module_init(nvme_init); module_exit(nvme_exit); From 7f53f9d2424533256ae86f7df5661a17de743de8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 22 Mar 2011 15:55:45 -0400 Subject: [PATCH 071/105] NVMe: Correct the Controller Configuration settings The arbitration field was extended by one bit, shifting the shutdown notification bits by one. Also, the SQ/CQ entry size was made configurable for future extensions. 
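The new entry-size fields hold the log2 of the SQ and CQ entry sizes, so 6 << 16 and 4 << 20 select the 64-byte submission entries and 16-byte completion entries this driver uses. An equivalent, illustrative way to derive the same bits (not the patch itself; cc stands in for the value written to the register):

	/* CC.IOSQES is bits 19:16 and CC.IOCQES is bits 23:20, log2 encoded */
	cc |= ilog2(sizeof(struct nvme_command)) << 16;		/* 64 bytes -> 6 */
	cc |= ilog2(sizeof(struct nvme_completion)) << 20;	/* 16 bytes -> 4 */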
Reported-by: Paul Luse Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + include/linux/nvme.h | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index d3eeca5a3c4c..014a7f6e39bc 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -905,6 +905,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; writel(0, &dev->bar->cc); writel(aqa, &dev->bar->aqa); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 347ad5f9a721..9d6febb91521 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -41,10 +41,12 @@ enum { NVME_CC_MPS_SHIFT = 7, NVME_CC_ARB_RR = 0 << 11, NVME_CC_ARB_WRRU = 1 << 11, - NVME_CC_ARB_VS = 3 << 11, - NVME_CC_SHN_NONE = 0 << 13, - NVME_CC_SHN_NORMAL = 1 << 13, - NVME_CC_SHN_ABRUPT = 2 << 13, + NVME_CC_ARB_VS = 7 << 11, + NVME_CC_SHN_NONE = 0 << 14, + NVME_CC_SHN_NORMAL = 1 << 14, + NVME_CC_SHN_ABRUPT = 2 << 14, + NVME_CC_IOSQES = 6 << 16, + NVME_CC_IOCQES = 4 << 20, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_SHST_NORMAL = 0 << 2, From aba2080f3f1639f9202f1a52993669844abcfb80 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 27 Mar 2011 08:52:06 -0400 Subject: [PATCH 072/105] NVMe: Fix warning in free_irq We need to clear the affinity mask before calling free_irq() Reported-by: Shane Michael Matthews Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 014a7f6e39bc..bcc780ac4ec0 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -781,8 +781,10 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) static void nvme_free_queue(struct nvme_dev *dev, int qid) { struct nvme_queue *nvmeq = dev->queues[qid]; + int vector = dev->entry[nvmeq->cq_vector].vector; - free_irq(dev->entry[nvmeq->cq_vector].vector, nvmeq); + irq_set_affinity_hint(vector, NULL); + free_irq(vector, nvmeq); /* Don't tell the adapter to delete the admin queue */ if (qid) { From 22605f96810d073eb74051d0295b6577d6a6a563 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 19 Apr 2011 15:04:20 -0400 Subject: [PATCH 073/105] NVMe: Time out initialisation after a few seconds THe device reports (in its capability register) how long it will take to initialise. If that time elapses before the ready bit becomes set, conclude the device is broken and refuse to initialise it. Log a nice error message so the user knows why we did nothing. 
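The arithmetic deserves a note: CAP.TO lives in bits 31:24 of the capability register and is expressed in 500 millisecond units, which is where the '* HZ / 2' comes from. A sketch of the wait loop this patch adds:

    u64 cap = readq(&dev->bar->cap);
    unsigned long timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

    while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
            msleep(100);
            if (time_after(jiffies, timeout))
                    return -ENODEV;        /* device never became ready */
    }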
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 ++++++++++ include/linux/nvme.h | 2 ++ 2 files changed, 12 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index bcc780ac4ec0..57f2b33a47dd 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -893,6 +893,8 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; + u64 cap; + unsigned long timeout; struct nvme_queue *nvmeq; dev->dbs = ((void __iomem *)dev->bar) + 4096; @@ -915,10 +917,18 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) writeq(nvmeq->cq_dma_addr, &dev->bar->acq); writel(dev->ctrl_config, &dev->bar->cc); + cap = readq(&dev->bar->cap); + timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { msleep(100); if (fatal_signal_pending(current)) return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device not ready; aborting initialisation\n"); + return -ENODEV; + } } result = queue_request_irq(dev, nvmeq, "nvme admin"); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9d6febb91521..a19304fefa7d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -35,6 +35,8 @@ struct nvme_bar { __u64 acq; /* Admin CQ Base Address */ }; +#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) + enum { NVME_CC_ENABLE = 1 << 0, NVME_CC_CSS_NVM = 0 << 4, From 09a58f536436efed02ead722e835cb4ce7674afc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 28 Apr 2011 23:09:09 -0700 Subject: [PATCH 074/105] NVMe: Fix bug in error handling When an I/O completed with an error, we would call bio_endio twice (once with -EIO and once with 0). Found by inspection. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 57f2b33a47dd..205405e7f6b0 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -310,9 +310,9 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, dma_unmap_sg(nvmeq->q_dmadev, nbio->sg, nbio->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); free_nbio(nvmeq, nbio); - if (status) + if (status) { bio_endio(bio, -EIO); - if (bio->bi_vcnt > bio->bi_idx) { + } else if (bio->bi_vcnt > bio->bi_idx) { bio_list_add(&nvmeq->sq_cong, bio); wake_up_process(nvme_thread); } else { From 21075bdee0a6f56058920d889df4ae561bfed754 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 28 Apr 2011 23:17:36 -0700 Subject: [PATCH 075/105] NVMe: Rename cancel_cmdid_data to cancel_cmdid The trailing '_data' on the end was annoying and inconsistent. Also, make it actually return the data since this is needed for timing out commands. 
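A sketch of how the returned value is consumed by the I/O timeout handling added in the next patch: the stored word is split back into its handler id and context pointer, and the handler is invoked with a synthesized abort completion.

    unsigned long data = cancel_cmdid(nvmeq, cmdid);
    unsigned char handler = data & 3;
    void *ptr = (void *)(data & ~3UL);

    nvme_completions[handler](nvmeq, ptr, &cqe);   /* cqe carries NVME_SC_ABORT_REQ */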
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 205405e7f6b0..9ca9db903ceb 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -213,10 +213,13 @@ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) return data; } -static void cancel_cmdid_data(struct nvme_queue *nvmeq, int cmdid) +static unsigned long cancel_cmdid(struct nvme_queue *nvmeq, int cmdid) { + unsigned long data; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + data = info[cmdid].ctx; info[cmdid].ctx = CMD_CTX_CANCELLED; + return data; } static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) @@ -667,7 +670,7 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) { spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid_data(nvmeq, cmdid); + cancel_cmdid(nvmeq, cmdid); spin_unlock_irq(&nvmeq->q_lock); } From 8de055350fbaa96b6563892c195a60be583faa9c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 12 May 2011 13:50:28 -0400 Subject: [PATCH 076/105] NVMe: Add support for timing out I/Os In the kthread, walk the list of outstanding I/Os and check they've not hit the timeout. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 9ca9db903ceb..9c0ab2af0fae 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -601,15 +602,15 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, typedef void (*completion_fn)(struct nvme_queue *, void *, struct nvme_completion *); +static const completion_fn nvme_completions[4] = { + [sync_completion_id] = sync_completion, + [bio_completion_id] = bio_completion, +}; + static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) { u16 head, phase; - static const completion_fn completions[4] = { - [sync_completion_id] = sync_completion, - [bio_completion_id] = bio_completion, - }; - head = nvmeq->cq_head; phase = nvmeq->cq_phase; @@ -629,7 +630,7 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) data = free_cmdid(nvmeq, cqe.command_id); handler = data & 3; ptr = (void *)(data & ~3UL); - completions[handler](nvmeq, ptr, &cqe); + nvme_completions[handler](nvmeq, ptr, &cqe); } /* If the controller ignores the cq head doorbell and continuously @@ -1172,6 +1173,29 @@ static const struct block_device_operations nvme_fops = { .compat_ioctl = nvme_ioctl, }; +static void nvme_timeout_ios(struct nvme_queue *nvmeq) +{ + int depth = nvmeq->q_depth - 1; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + unsigned long now = jiffies; + int cmdid; + + for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { + unsigned long data; + void *ptr; + unsigned char handler; + static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, }; + + if (!time_after(now, info[cmdid].timeout)) + continue; + dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid); + data = cancel_cmdid(nvmeq, cmdid); + handler = data & 3; + ptr = (void *)(data & ~3UL); + nvme_completions[handler](nvmeq, ptr, &cqe); + } +} + static void nvme_resubmit_bios(struct nvme_queue *nvmeq) { while (bio_list_peek(&nvmeq->sq_cong)) { @@ -1203,6 +1227,7 @@ static int nvme_kthread(void *data) spin_lock_irq(&nvmeq->q_lock); if (nvme_process_cq(nvmeq)) 
printk("process_cq did something\n"); + nvme_timeout_ios(nvmeq); nvme_resubmit_bios(nvmeq); spin_unlock_irq(&nvmeq->q_lock); } From fd63e9ceeeae58cfe877c2d49d41c1bf7532303c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 6 May 2011 08:37:54 -0400 Subject: [PATCH 077/105] NVMe: Add include of delay.h for msleep Previously it was being implicitly included through some other header file Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 9c0ab2af0fae..b285a7e0624d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include From 5aff9382ddc8aac6eb0c70ffbb351652d71da69a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 6 May 2011 08:45:47 -0400 Subject: [PATCH 078/105] NVMe: Use an IDA to allocate minor numbers The current approach of using the namespace ID as the minor number doesn't work when there are multiple adapters in the machine. Rather than statically partitioning the number of namespaces between adapters, dynamically allocate minor numbers to namespaces as they are detected. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b285a7e0624d..79012c53ae9c 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1240,7 +1241,34 @@ static int nvme_kthread(void *data) return 0; } -static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, +static DEFINE_IDA(nvme_index_ida); + +static int nvme_get_ns_idx(void) +{ + int index, error; + + do { + if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL)) + return -1; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_index_ida, &index); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + index = -1; + return index; +} + +static void nvme_put_ns_idx(int index) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_index_ida, index); + spin_unlock(&dev_list_lock); +} + +static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, struct nvme_id_ns *id, struct nvme_lba_range_type *rt) { struct nvme_ns *ns; @@ -1265,19 +1293,19 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, disk = alloc_disk(NVME_MINORS); if (!disk) goto out_free_queue; - ns->ns_id = index; + ns->ns_id = nsid; ns->disk = disk; lbaf = id->flbas & 0xf; ns->lba_shift = id->lbaf[lbaf].ds; disk->major = nvme_major; disk->minors = NVME_MINORS; - disk->first_minor = NVME_MINORS * index; + disk->first_minor = NVME_MINORS * nvme_get_ns_idx(); disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; disk->driverfs_dev = &dev->pci_dev->dev; - sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index); + sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); return ns; @@ -1291,7 +1319,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index, static void nvme_ns_free(struct nvme_ns *ns) { + int index = ns->disk->first_minor / NVME_MINORS; put_disk(ns->disk); + nvme_put_ns_idx(index); blk_cleanup_queue(ns->queue); kfree(ns); } From b77954cbddff28d55a36fad3c16f4daebb0f01df Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 12 May 2011 13:51:41 -0400 Subject: [PATCH 079/105] 
NVMe: Handle failures from memory allocations in nvme_setup_prps If any of the memory allocations in nvme_setup_prps fail, handle it by modifying the passed-in data length to reflect the number of bytes we are actually able to send. Also allow the caller to specify the GFP flags they need; for user-initiated commands, we can use GFP_KERNEL allocations. The various callers are updated to handle this possibility; the main I/O path is already prepared for this possibility (as it may happen due to nvme_map_bio being unable to map all the segments of the I/O). The other callers return -ENOMEM instead of doing partial I/Os. Reported-by: Andi Kleen Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 56 ++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 79012c53ae9c..ddc21ba24a70 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -329,9 +329,11 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, /* length is in bytes */ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, - struct scatterlist *sg, int length) + struct scatterlist *sg, int *len, + gfp_t gfp) { struct dma_pool *pool; + int length = *len; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); int offset = offset_in_page(dma_addr); @@ -361,7 +363,12 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, nprps = DIV_ROUND_UP(length, PAGE_SIZE); npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE); - prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, GFP_ATOMIC); + prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, gfp); + if (!prps) { + cmd->prp2 = cpu_to_le64(dma_addr); + *len = (*len - length) + PAGE_SIZE; + return prps; + } prp_page = 0; if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; @@ -371,7 +378,13 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, prps->npages = npages; } - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) { + cmd->prp2 = cpu_to_le64(dma_addr); + *len = (*len - length) + PAGE_SIZE; + kfree(prps); + return NULL; + } prps->list[prp_page++] = prp_list; prps->first_dma = prp_dma; cmd->prp2 = cpu_to_le64(prp_dma); @@ -379,7 +392,11 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, for (;;) { if (i == PAGE_SIZE / 8) { __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) { + *len = (*len - length); + return prps; + } prps->list[prp_page++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); @@ -525,7 +542,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg, - length); + &length, GFP_ATOMIC); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); @@ -1009,15 +1026,18 @@ static int nvme_submit_user_admin_command(struct nvme_dev *dev, unsigned long addr, unsigned length, struct nvme_command *cmd) { - int err, nents; + int err, nents, tmplen = length; struct scatterlist *sg; struct nvme_prps *prps; nents = nvme_map_user_pages(dev, 0, addr, length, &sg); if (nents < 0) return nents; - prps = 
nvme_setup_prps(dev, &cmd->common, sg, length); - err = nvme_submit_admin_cmd(dev, cmd, NULL); + prps = nvme_setup_prps(dev, &cmd->common, sg, &tmplen, GFP_KERNEL); + if (tmplen != length) + err = -ENOMEM; + else + err = nvme_submit_admin_cmd(dev, cmd, NULL); nvme_unmap_user_pages(dev, 0, addr, length, sg, nents); nvme_free_prps(dev, prps); return err ? -EIO : 0; @@ -1086,7 +1106,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.apptag = io.apptag; c.rw.appmask = io.appmask; /* XXX: metadata */ - prps = nvme_setup_prps(dev, &c.common, sg, length); + prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); nvmeq = get_nvmeq(ns); /* @@ -1096,7 +1116,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) * additional races since q_lock already protects against other CPUs. */ put_nvmeq(nvmeq); - status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); + if (length != (io.nblocks + 1) << ns->lba_shift) + status = -ENOMEM; + else + status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); nvme_free_prps(dev, prps); @@ -1109,7 +1132,7 @@ static int nvme_download_firmware(struct nvme_ns *ns, struct nvme_dev *dev = ns->dev; struct nvme_dlfw dlfw; struct nvme_command c; - int nents, status; + int nents, status, length; struct scatterlist *sg; struct nvme_prps *prps; @@ -1117,8 +1140,9 @@ static int nvme_download_firmware(struct nvme_ns *ns, return -EFAULT; if (dlfw.length >= (1 << 30)) return -EINVAL; + length = dlfw.length * 4; - nents = nvme_map_user_pages(dev, 1, dlfw.addr, dlfw.length * 4, &sg); + nents = nvme_map_user_pages(dev, 1, dlfw.addr, length, &sg); if (nents < 0) return nents; @@ -1126,9 +1150,11 @@ static int nvme_download_firmware(struct nvme_ns *ns, c.dlfw.opcode = nvme_admin_download_fw; c.dlfw.numd = cpu_to_le32(dlfw.length); c.dlfw.offset = cpu_to_le32(dlfw.offset); - prps = nvme_setup_prps(dev, &c.common, sg, dlfw.length * 4); - - status = nvme_submit_admin_cmd(dev, &c, NULL); + prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); + if (length != dlfw.length * 4) + status = -ENOMEM; + else + status = nvme_submit_admin_cmd(dev, &c, NULL); nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents); nvme_free_prps(dev, prps); return status; From 184d2944cb3b92a2e8e1733c59d1e531ad6e924a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 11 May 2011 21:36:38 -0400 Subject: [PATCH 080/105] NVMe: Add a few calling convention notes For the benefit of reviewers, add comments to a few functions describing their calling context Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index ddc21ba24a70..12062c108bd9 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -155,6 +155,9 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) * the bottom two bits of the ctx pointer to store the handler ID. * Passing in a pointer that's not 4-byte aligned will cause a BUG. * We can change this if it becomes a problem. + * + * May be called with local interrupts disabled and the q_lock held, + * or with interrupts enabled and no locks held. 
*/ static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler, unsigned timeout) @@ -202,6 +205,9 @@ enum { #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) { unsigned long data; @@ -326,7 +332,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, } } -/* length is in bytes */ +/* length is in bytes. gfp flags indicates whether we may sleep. */ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, struct scatterlist *sg, int *len, @@ -483,6 +489,9 @@ static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) return nvme_submit_flush(nvmeq, ns, cmdid); } +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio) { From be5e09484078e95af20acb13e215cd8aec705893 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 11 May 2011 21:38:57 -0400 Subject: [PATCH 081/105] NVMe: Version 0.6 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 12062c108bd9..d1cd91becdb0 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1737,6 +1737,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.5"); +MODULE_VERSION("0.6"); module_init(nvme_init); module_exit(nvme_exit); From 6f0f54499f2edf7e25410cdd99e6f030f3485fd1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 11 May 2011 13:30:59 -0700 Subject: [PATCH 082/105] NVMe: Return real error from nvme_create_queue nvme_setup_io_queues() was assuming that a NULL return from nvme_create_queue() was an out-of-memory error. That's not necessarily true; the adapter might return -EIO, for example. Change the calling convention to return an ERR_PTR on failure instead of NULL. 
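A sketch of the convention on both sides (mirrors the hunks below):

    /* producer */
    if (!nvmeq)
            return ERR_PTR(-ENOMEM);
    ...
    return ERR_PTR(result);            /* e.g. -EIO reported by the adapter */

    /* consumer */
    nvmeq = nvme_create_queue(dev, qid, NVME_Q_DEPTH, vector);
    if (IS_ERR(nvmeq))
            return PTR_ERR(nvmeq);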
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index d1cd91becdb0..843edbd79c56 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -892,7 +892,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); if (!nvmeq) - return NULL; + return ERR_PTR(-ENOMEM); result = adapter_alloc_cq(dev, qid, nvmeq); if (result < 0) @@ -918,7 +918,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); kfree(nvmeq); - return NULL; + return ERR_PTR(result); } static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) @@ -1421,8 +1421,8 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) for (i = 0; i < nr_io_queues; i++) { dev->queues[i + 1] = nvme_create_queue(dev, i + 1, NVME_Q_DEPTH, i); - if (!dev->queues[i + 1]) - return -ENOMEM; + if (IS_ERR(dev->queues[i + 1])) + return PTR_ERR(dev->queues[i + 1]); dev->queue_count++; } From eac623ba7a91474a688eb5d0fcd0eaa6a56dc41c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2011 09:34:43 -0400 Subject: [PATCH 083/105] NVMe: Add the nvme thread to the wait queue before waking it up If the I/O was not completed by a single NVMe command, we add the bio to the congestion list and wake up the kthread to resubmit it. But the kthread calls remove_wait_queue() unconditionally, which will oops if it's not on the wait queue. So add the kthread to the wait queue before waking it up. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 843edbd79c56..f5e51a6116e3 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -325,6 +325,8 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, if (status) { bio_endio(bio, -EIO); } else if (bio->bi_vcnt > bio->bi_idx) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); bio_list_add(&nvmeq->sq_cong, bio); wake_up_process(nvme_thread); } else { From 6bbf1acddeed0bfb345a5578f9fcada16f1e514f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2011 13:03:42 -0400 Subject: [PATCH 084/105] NVMe: Rework ioctls Remove the special-purpose IDENTIFY, GET_RANGE_TYPE, DOWNLOAD_FIRMWARE and ACTIVATE_FIRMWARE commands. Replace them with a generic ADMIN_CMD ioctl that can submit any admin command. Add a new ID ioctl that returns the namespace ID of the queried device. It corresponds to the SCSI Idlun ioctl. 
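A sketch of what a user-space caller looks like under the new interface, using Identify Controller as the example. Field names follow struct nvme_admin_cmd below; the 0x06 opcode and the placement of CNS in cdw10 come from the NVM Express specification, 'fd' is assumed to be an open handle on the nvme block device, and 'buffer' a caller-allocated 4096-byte buffer (needs <sys/ioctl.h> and the new <linux/nvme.h>):

    struct nvme_admin_cmd cmd = {
            .opcode   = 0x06,                   /* Identify */
            .addr     = (unsigned long)buffer,  /* 4096-byte user buffer */
            .data_len = 4096,
            .cdw10    = 1,                      /* CNS = 1: identify controller */
    };
    int err = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
    /* 0 on success, a negative errno or a positive NVMe status code otherwise */

    int nsid = ioctl(fd, NVME_IOCTL_ID);        /* namespace ID of this device */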
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 128 ++++++++++++++----------------------------- include/linux/nvme.h | 34 ++++++++---- 2 files changed, 63 insertions(+), 99 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f5e51a6116e3..9e3c724b95c3 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1033,51 +1033,6 @@ static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, put_page(sg_page(&sg[i])); } -static int nvme_submit_user_admin_command(struct nvme_dev *dev, - unsigned long addr, unsigned length, - struct nvme_command *cmd) -{ - int err, nents, tmplen = length; - struct scatterlist *sg; - struct nvme_prps *prps; - - nents = nvme_map_user_pages(dev, 0, addr, length, &sg); - if (nents < 0) - return nents; - prps = nvme_setup_prps(dev, &cmd->common, sg, &tmplen, GFP_KERNEL); - if (tmplen != length) - err = -ENOMEM; - else - err = nvme_submit_admin_cmd(dev, cmd, NULL); - nvme_unmap_user_pages(dev, 0, addr, length, sg, nents); - nvme_free_prps(dev, prps); - return err ? -EIO : 0; -} - -static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id); - c.identify.cns = cpu_to_le32(cns); - - return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c); -} - -static int nvme_get_range_type(struct nvme_ns *ns, unsigned long addr) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(ns->ns_id); - c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); - - return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c); -} - static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_dev *dev = ns->dev; @@ -1096,10 +1051,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) switch (io.opcode) { case nvme_cmd_write: case nvme_cmd_read: + case nvme_cmd_compare: nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length, &sg); default: - return -EFAULT; + return -EINVAL; } if (nents < 0) @@ -1137,70 +1093,66 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_download_firmware(struct nvme_ns *ns, - struct nvme_dlfw __user *udlfw) +static int nvme_user_admin_cmd(struct nvme_ns *ns, + struct nvme_admin_cmd __user *ucmd) { struct nvme_dev *dev = ns->dev; - struct nvme_dlfw dlfw; + struct nvme_admin_cmd cmd; struct nvme_command c; - int nents, status, length; + int status, length, nents = 0; struct scatterlist *sg; - struct nvme_prps *prps; + struct nvme_prps *prps = NULL; - if (copy_from_user(&dlfw, udlfw, sizeof(dlfw))) + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; - if (dlfw.length >= (1 << 30)) - return -EINVAL; - length = dlfw.length * 4; - - nents = nvme_map_user_pages(dev, 1, dlfw.addr, length, &sg); - if (nents < 0) - return nents; memset(&c, 0, sizeof(c)); - c.dlfw.opcode = nvme_admin_download_fw; - c.dlfw.numd = cpu_to_le32(dlfw.length); - c.dlfw.offset = cpu_to_le32(dlfw.offset); - prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); - if (length != dlfw.length * 4) + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = 
cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + length = cmd.data_len; + if (cmd.data_len) { + nents = nvme_map_user_pages(dev, 1, cmd.addr, length, &sg); + if (nents < 0) + return nents; + prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); + } + + if (length != cmd.data_len) status = -ENOMEM; else status = nvme_submit_admin_cmd(dev, &c, NULL); - nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents); - nvme_free_prps(dev, prps); + if (cmd.data_len) { + nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, sg, + nents); + nvme_free_prps(dev, prps); + } return status; } -static int nvme_activate_firmware(struct nvme_ns *ns, unsigned long arg) -{ - struct nvme_dev *dev = ns->dev; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_activate_fw; - c.common.rsvd10[0] = cpu_to_le32(arg); - - return nvme_submit_admin_cmd(dev, &c, NULL); -} - static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = bdev->bd_disk->private_data; switch (cmd) { - case NVME_IOCTL_IDENTIFY_NS: - return nvme_identify(ns, arg, 0); - case NVME_IOCTL_IDENTIFY_CTRL: - return nvme_identify(ns, arg, 1); - case NVME_IOCTL_GET_RANGE_TYPE: - return nvme_get_range_type(ns, arg); + case NVME_IOCTL_ID: + return ns->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_admin_cmd(ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); - case NVME_IOCTL_DOWNLOAD_FW: - return nvme_download_firmware(ns, (void __user *)arg); - case NVME_IOCTL_ACTIVATE_FW: - return nvme_activate_firmware(ns, arg); default: return -ENOTTY; } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a19304fefa7d..c96ab0f5ef6f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -153,11 +153,11 @@ struct nvme_common_command { __u8 flags; __u16 command_id; __le32 nsid; - __u64 rsvd2; + __u32 cdw2[2]; __le64 metadata; __le64 prp1; __le64 prp2; - __u32 rsvd10[6]; + __u32 cdw10[6]; }; struct nvme_rw_command { @@ -388,17 +388,29 @@ struct nvme_user_io { __u16 appmask; }; -struct nvme_dlfw { +struct nvme_admin_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; __u64 addr; - __u32 length; /* In dwords */ - __u32 offset; /* In dwords */ + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 result; }; -#define NVME_IOCTL_IDENTIFY_NS _IOW('N', 0x40, struct nvme_id_ns) -#define NVME_IOCTL_IDENTIFY_CTRL _IOW('N', 0x41, struct nvme_id_ctrl) -#define NVME_IOCTL_GET_RANGE_TYPE _IOW('N', 0x42, struct nvme_lba_range_type) -#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x43, struct nvme_user_io) -#define NVME_IOCTL_DOWNLOAD_FW _IOW('N', 0x44, struct nvme_dlfw) -#define NVME_IOCTL_ACTIVATE_FW _IO('N', 0x45) +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) #endif /* _LINUX_NVME_H */ From 6413214c5d424fd5aae6567848340f962ad2ce0f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 9 Aug 2011 12:56:37 -0400 Subject: [PATCH 085/105] Fix bug in NVME_IOCTL_SUBMIT_IO Missing 'break' in the switch statement meant 
that we'd fall through to the 'return -EINVAL' case. --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 9e3c724b95c3..0956e1241520 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1054,6 +1054,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) case nvme_cmd_compare: nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length, &sg); + break; default: return -EINVAL; } From d0ba1e497bca83a3d353eb47c9658afc54d83228 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 13 Sep 2011 17:01:39 -0400 Subject: [PATCH 086/105] NVMe: Correct sg list setup in nvme_map_user_pages Our SG list was constructed to always fill the entire first page, even if that was more than the length of the I/O. This is probably harmless, but some IOMMUs might do something bad. Correcting the first call to sg_set_page() made it look a lot closer to the sg_set_page() in the loop, so fold the first call to sg_set_page() into the loop. Reported-by: Nisheeth Bhat Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 0956e1241520..5843409cac6d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -996,11 +996,11 @@ static int nvme_map_user_pages(struct nvme_dev *dev, int write, sg = kcalloc(count, sizeof(*sg), GFP_KERNEL); sg_init_table(sg, count); - sg_set_page(&sg[0], pages[0], PAGE_SIZE - offset, offset); - length -= (PAGE_SIZE - offset); - for (i = 1; i < count; i++) { - sg_set_page(&sg[i], pages[i], min_t(int, length, PAGE_SIZE), 0); - length -= PAGE_SIZE; + for (i = 0; i < count; i++) { + sg_set_page(&sg[i], pages[i], + min_t(int, length, PAGE_SIZE - offset), offset); + length -= (PAGE_SIZE - offset); + offset = 0; } err = -ENOMEM; From d1a490e026efb22851ed60588b5fad1281d80ec3 Mon Sep 17 00:00:00 2001 From: Nisheeth Bhat Date: Thu, 15 Sep 2011 16:52:24 -0400 Subject: [PATCH 087/105] NVMe: Fix calls to dma_unmap_sg dma_unmap_sg() must be called with the same 'nents' passed to dma_map_sg(), not the number returned from dma_map_sg(). 
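The DMA API rule being applied, in sketch form ('dev' and 'dir' stand in for the device and direction used at map time):

    int count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE);
    int nents = dma_map_sg(dev, sg, count, dir);   /* may return fewer than count */
    ...
    dma_unmap_sg(dev, sg, count, dir);             /* unmap with the original count */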
Signed-off-by: Nisheeth Bhat Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 5843409cac6d..a7f82fbdaf87 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1021,13 +1021,12 @@ static int nvme_map_user_pages(struct nvme_dev *dev, int write, } static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, int length, - struct scatterlist *sg, int nents) + unsigned long addr, int length, struct scatterlist *sg) { int i, count; count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE); - dma_unmap_sg(&dev->pci_dev->dev, sg, nents, DMA_FROM_DEVICE); + dma_unmap_sg(&dev->pci_dev->dev, sg, count, DMA_FROM_DEVICE); for (i = 0; i < count; i++) put_page(sg_page(&sg[i])); @@ -1089,7 +1088,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) else status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); - nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents); + nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg); nvme_free_prps(dev, prps); return status; } @@ -1135,8 +1134,7 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, else status = nvme_submit_admin_cmd(dev, &c, NULL); if (cmd.data_len) { - nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, sg, - nents); + nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, sg); nvme_free_prps(dev, prps); } return status; From 684f5c2025b067a23722e620d0b3b858d8dc5d01 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 19 Sep 2011 17:14:53 -0400 Subject: [PATCH 088/105] NVMe: Fix memory leak in nvme_dev_add() The driver was allocating 8k of memory, then freeing 4k of it. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index a7f82fbdaf87..705f66ebd15f 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1457,7 +1457,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) list_for_each_entry(ns, &dev->namespaces, list) add_disk(ns->disk); - dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr); return 0; out_free: @@ -1466,7 +1466,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) nvme_ns_free(ns); } - dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); return res; } From bc5fc7e4b22ca855902aba02b28c96f09b446407 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 19 Sep 2011 17:08:14 -0400 Subject: [PATCH 089/105] NVMe: Create nvme_identify and nvme_get_features functions Instead of open-coding calls to nvme_submit_admin_cmd, these small wrappers are simpler to use (the patch removes 14 lines from nvme_dev_add() for example). 
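Typical use of the two helpers, sketched from the reworked nvme_dev_add() below:

    res = nvme_identify(dev, 0, 1, dma_addr);      /* CNS 1: identify controller */
    ...
    res = nvme_identify(dev, i, 0, dma_addr);      /* CNS 0: identify namespace i */
    res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
                            dma_addr + 4096, NULL);   /* result word not needed */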
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 76 +++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 705f66ebd15f..b77894a75855 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -812,6 +812,34 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } +static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, + dma_addr_t dma_addr) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.prp1 = cpu_to_le64(dma_addr); + c.identify.cns = cpu_to_le32(cns); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int nvme_get_features(struct nvme_dev *dev, unsigned fid, + unsigned dword11, dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return nvme_submit_admin_cmd(dev, &c, result); +} + static void nvme_free_queue(struct nvme_dev *dev, int qid) { struct nvme_queue *nvmeq = dev->queues[qid]; @@ -1318,15 +1346,10 @@ static int set_queue_count(struct nvme_dev *dev, int count) { int status; u32 result; - struct nvme_command c; u32 q_count = (count - 1) | ((count - 1) << 16); - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES); - c.features.dword11 = cpu_to_le32(q_count); - - status = nvme_submit_admin_cmd(dev, &c, &result); + status = nvme_get_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, + &result); if (status) return -EIO; return min(result & 0xffff, result >> 16) + 1; @@ -1400,65 +1423,51 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) int res, nn, i; struct nvme_ns *ns, *next; struct nvme_id_ctrl *ctrl; - void *id; + struct nvme_id_ns *id_ns; + void *mem; dma_addr_t dma_addr; - struct nvme_command cid, crt; res = nvme_setup_io_queues(dev); if (res) return res; - /* XXX: Switch to a SG list once prp2 works */ - id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, + mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, GFP_KERNEL); - memset(&cid, 0, sizeof(cid)); - cid.identify.opcode = nvme_admin_identify; - cid.identify.nsid = 0; - cid.identify.prp1 = cpu_to_le64(dma_addr); - cid.identify.cns = cpu_to_le32(1); - - res = nvme_submit_admin_cmd(dev, &cid, NULL); + res = nvme_identify(dev, 0, 1, dma_addr); if (res) { res = -EIO; goto out_free; } - ctrl = id; + ctrl = mem; nn = le32_to_cpup(&ctrl->nn); memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - cid.identify.cns = 0; - memset(&crt, 0, sizeof(crt)); - crt.features.opcode = nvme_admin_get_features; - crt.features.prp1 = cpu_to_le64(dma_addr + 4096); - crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE); - + id_ns = mem; for (i = 0; i <= nn; i++) { - cid.identify.nsid = cpu_to_le32(i); - res = nvme_submit_admin_cmd(dev, &cid, NULL); + res = nvme_identify(dev, i, 0, dma_addr); if (res) continue; - if (((struct nvme_id_ns *)id)->ncap == 0) + if (id_ns->ncap == 0) continue; - crt.features.nsid = cpu_to_le32(i); - res = nvme_submit_admin_cmd(dev, &crt, NULL); + res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, + 
dma_addr + 4096, NULL); if (res) continue; - ns = nvme_alloc_ns(dev, i, id, id + 4096); + ns = nvme_alloc_ns(dev, i, mem, mem + 4096); if (ns) list_add_tail(&ns->list, &dev->namespaces); } list_for_each_entry(ns, &dev->namespaces, list) add_disk(ns->disk); - dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr); - return 0; + goto out; out_free: list_for_each_entry_safe(ns, next, &dev->namespaces, list) { @@ -1466,6 +1475,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) nvme_ns_free(ns); } + out: dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); return res; } From 0d1bc9125890426b52ca2de6abedd32e31722e5c Mon Sep 17 00:00:00 2001 From: Nisheeth Bhat Date: Thu, 29 Sep 2011 10:10:10 -0400 Subject: [PATCH 090/105] Fix calculation of number of pages in a PRP List The existing calculation underestimated the number of pages required as it did not take into account the pointer at the end of each page. The replacement calculation may overestimate the number of pages required if the last page in the PRP List is entirely full. By using ->npages as a counter as we fill in the pages, we ensure that we don't try to free a page that was never allocated. Signed-off-by: Nisheeth Bhat Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b77894a75855..3afdc750aaa8 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -265,7 +265,7 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) } struct nvme_prps { - int npages; + int npages; /* 0 means small pool in use */ dma_addr_t first_dma; __le64 *list[0]; }; @@ -347,7 +347,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, int offset = offset_in_page(dma_addr); __le64 *prp_list; dma_addr_t prp_dma; - int nprps, npages, i, prp_page; + int nprps, npages, i; struct nvme_prps *prps = NULL; cmd->prp1 = cpu_to_le64(dma_addr); @@ -370,20 +370,20 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, } nprps = DIV_ROUND_UP(length, PAGE_SIZE); - npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE); + npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, gfp); if (!prps) { cmd->prp2 = cpu_to_le64(dma_addr); *len = (*len - length) + PAGE_SIZE; return prps; } - prp_page = 0; + if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; prps->npages = 0; } else { pool = dev->prp_page_pool; - prps->npages = npages; + prps->npages = 1; } prp_list = dma_pool_alloc(pool, gfp, &prp_dma); @@ -393,7 +393,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, kfree(prps); return NULL; } - prps->list[prp_page++] = prp_list; + prps->list[0] = prp_list; prps->first_dma = prp_dma; cmd->prp2 = cpu_to_le64(prp_dma); i = 0; @@ -405,7 +405,7 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, *len = (*len - length); return prps; } - prps->list[prp_page++] = prp_list; + prps->list[prps->npages++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; From 2b2c1896871838cdf549442e8ad0264be5fa74e3 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 7 Oct 2011 13:10:13 -0400 Subject: [PATCH 091/105] NVMe: Don't probe namespace 0 ECN 001 documented that namespace 0 is not valid. Sending an Identify with CNS of 0 and Namespace of 0 is an undefined command. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 3afdc750aaa8..660aa5dfe569 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1447,7 +1447,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); id_ns = mem; - for (i = 0; i <= nn; i++) { + for (i = 1; i <= nn; i++) { res = nvme_identify(dev, i, 0, dma_addr); if (res) continue; From ce38c149576fd0a3360fec3bef4012212d42e736 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 7 Oct 2011 13:20:37 -0400 Subject: [PATCH 092/105] NVMe: Version 0.7 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 660aa5dfe569..cfe5932821d8 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1700,6 +1700,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.6"); +MODULE_VERSION("0.7"); module_init(nvme_init); module_exit(nvme_exit); From f1938f6e1ee1583c87ec74dc406fdd8694e99ac8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Oct 2011 17:00:41 -0400 Subject: [PATCH 093/105] NVMe: Implement doorbell stride capability The doorbell stride allows devices to spread out their doorbells instead of packing them tightly. This feature was added as part of ECN 003. This patch also enables support for more than 512 queues :-) Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 17 ++++++++++++++--- include/linux/nvme.h | 1 + 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index cfe5932821d8..a17f80fa3881 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -70,6 +70,7 @@ struct nvme_dev { struct dma_pool *prp_small_pool; int instance; int queue_count; + int db_stride; u32 ctrl_config; struct msix_entry *entry; struct nvme_bar __iomem *bar; @@ -672,7 +673,7 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) return IRQ_NONE; - writel(head, nvmeq->q_db + 1); + writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); nvmeq->cq_head = head; nvmeq->cq_phase = phase; @@ -889,7 +890,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, init_waitqueue_head(&nvmeq->sq_full); init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); bio_list_init(&nvmeq->sq_cong); - nvmeq->q_db = &dev->dbs[qid * 2]; + nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; @@ -981,6 +982,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) cap = readq(&dev->bar->cap); timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + dev->db_stride = NVME_CAP_STRIDE(cap); while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { msleep(100); @@ -1357,7 +1359,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) { - int result, cpu, i, nr_io_queues; + int result, cpu, i, nr_io_queues, db_bar_size; nr_io_queues = num_online_cpus(); result = set_queue_count(dev, nr_io_queues); @@ -1369,6 +1371,15 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, dev->queues[0]); + db_bar_size = 4096 + ((nr_io_queues + 1) << 
(dev->db_stride + 3)); + if (db_bar_size > 8192) { + iounmap(dev->bar); + dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), + db_bar_size); + dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->queues[0]->q_db = dev->dbs; + } + for (i = 0; i < nr_io_queues; i++) dev->entry[i].entry = i; for (;;) { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c96ab0f5ef6f..2a2c535c8345 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -36,6 +36,7 @@ struct nvme_bar { }; #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) +#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) enum { NVME_CC_ENABLE = 1 << 0, From 010e646ba2fdfc558048a97da746381c35836280 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 4 Nov 2011 16:24:23 -0400 Subject: [PATCH 094/105] NVMe: Update Identify Controller data structure The driver was still using an old definition of Identify Controller which only came to light once we started using the 'number of namespaces' field properly. Reported-by: Nisheeth Bhat Reported-by: Khosrow Panah Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2a2c535c8345..9490a00529f4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -57,6 +57,18 @@ enum { NVME_CSTS_SHST_CMPLT = 2 << 2, }; +struct nvme_id_power_state { + __le16 max_power; /* centiwatts */ + __u16 rsvd2; + __le32 entry_lat; /* microseconds */ + __le32 exit_lat; /* microseconds */ + __u8 read_tput; + __u8 read_lat; + __u8 write_tput; + __u8 write_lat; + __u8 rsvd16[16]; +}; + #define NVME_VS(major, minor) (major << 16 | minor) struct nvme_id_ctrl { @@ -65,9 +77,11 @@ struct nvme_id_ctrl { char sn[20]; char mn[40]; char fr[8]; - __le32 nn; __u8 rab; - __u8 rsvd77[178]; + __u8 ieee[3]; + __u8 mic; + __u8 mdts; + __u8 rsvd78[178]; __le16 oacs; __u8 acl; __u8 aerl; @@ -76,15 +90,18 @@ struct nvme_id_ctrl { __u8 elpe; __u8 npss; __u8 rsvd264[248]; - __le64 psd[32]; + __u8 sqes; + __u8 cqes; + __u8 rsvd514[2]; + __le32 nn; __le16 oncs; __le16 fuses; __u8 fna; __u8 vwc; __le16 awun; __le16 awupf; - __u8 rsvd778[246]; - __u8 cmdset[2048]; + __u8 rsvd530[1518]; + struct nvme_id_power_state psd[32]; __u8 vs[1024]; }; From c2f5b65020869215814df03c3941dac9436f99fb Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 15 Oct 2011 07:33:46 -0400 Subject: [PATCH 095/105] NVMe: Simplify completion handling Instead of encoding the handler type in the bottom two bits of the per-completion context pointer, store the handler function as well as the context pointer. This gives us more flexibility and the code is clearer. It comes at the cost of an extra 8k of memory per queue, but this feels like a reasonable price to pay. 
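The cost quoted above works out as follows: each cmdid slot grows by one 8-byte function pointer, and with the default queue depth of 1024 that is 8192 bytes per queue. A sketch of the per-command bookkeeping after the change, as the patch defines it below:

    typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
                                       struct nvme_completion *);

    struct nvme_cmd_info {
            nvme_completion_fn fn;     /* replaces the 2-bit handler id */
            void *ctx;                 /* no longer needs 4-byte alignment */
            unsigned long timeout;
    };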
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 167 +++++++++++++++++++++---------------------- 1 file changed, 81 insertions(+), 86 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index a17f80fa3881..4724655a6ebf 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -135,8 +135,12 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); } +typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, + struct nvme_completion *); + struct nvme_cmd_info { - unsigned long ctx; + nvme_completion_fn fn; + void *ctx; unsigned long timeout; }; @@ -149,7 +153,7 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) * alloc_cmdid() - Allocate a Command ID * @nvmeq: The queue that will be used for this command * @ctx: A pointer that will be passed to the handler - * @handler: The ID of the handler to call + * @handler: The function to call on completion * * Allocate a Command ID for a queue. The data passed in will * be passed to the completion handler. This is implemented by using @@ -160,28 +164,27 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) * May be called with local interrupts disabled and the q_lock held, * or with interrupts enabled and no locks held. */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler, - unsigned timeout) +static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, + nvme_completion_fn handler, unsigned timeout) { int depth = nvmeq->q_depth - 1; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); int cmdid; - BUG_ON((unsigned long)ctx & 3); - do { cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); if (cmdid >= depth) return -EBUSY; } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); - info[cmdid].ctx = (unsigned long)ctx | handler; + info[cmdid].fn = handler; + info[cmdid].ctx = ctx; info[cmdid].timeout = jiffies + timeout; return cmdid; } static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - int handler, unsigned timeout) + nvme_completion_fn handler, unsigned timeout) { int cmdid; wait_event_killable(nvmeq->sq_full, @@ -189,47 +192,69 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, return (cmdid < 0) ? -EINTR : cmdid; } -/* - * If you need more than four handlers, you'll need to change how - * alloc_cmdid and nvme_process_cq work. Consider using a special - * CMD_CTX value instead, if that works for your situation. 
- */ -enum { - sync_completion_id = 0, - bio_completion_id, -}; - -/* Special values must be a multiple of 4, and less than 0x1000 */ -#define CMD_CTX_BASE (POISON_POINTER_DELTA + sync_completion_id) +/* Special values must be less than 0x1000 */ +#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) +static void special_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + if (ctx == CMD_CTX_CANCELLED) + return; + if (ctx == CMD_CTX_FLUSH) + return; + if (ctx == CMD_CTX_COMPLETED) { + dev_warn(nvmeq->q_dmadev, + "completed id %d twice on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + if (ctx == CMD_CTX_INVALID) { + dev_warn(nvmeq->q_dmadev, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + + dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); +} + /* * Called with local interrupts disabled and the q_lock held. May not sleep. */ -static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid) +static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, + nvme_completion_fn *fn) { - unsigned long data; + void *ctx; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid >= nvmeq->q_depth) + if (cmdid >= nvmeq->q_depth) { + *fn = special_completion; return CMD_CTX_INVALID; - data = info[cmdid].ctx; + } + *fn = info[cmdid].fn; + ctx = info[cmdid].ctx; + info[cmdid].fn = special_completion; info[cmdid].ctx = CMD_CTX_COMPLETED; clear_bit(cmdid, nvmeq->cmdid_data); wake_up(&nvmeq->sq_full); - return data; + return ctx; } -static unsigned long cancel_cmdid(struct nvme_queue *nvmeq, int cmdid) +static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, + nvme_completion_fn *fn) { - unsigned long data; + void *ctx; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - data = info[cmdid].ctx; + if (fn) + *fn = info[cmdid].fn; + ctx = info[cmdid].ctx; + info[cmdid].fn = special_completion; info[cmdid].ctx = CMD_CTX_CANCELLED; - return data; + return ctx; } static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) @@ -485,7 +510,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) { int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, - sync_completion_id, IO_TIMEOUT); + special_completion, IO_TIMEOUT); if (unlikely(cmdid < 0)) return cmdid; @@ -518,7 +543,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, nbio->bio = bio; result = -EBUSY; - cmdid = alloc_cmdid(nvmeq, nbio, bio_completion_id, IO_TIMEOUT); + cmdid = alloc_cmdid(nvmeq, nbio, bio_completion, IO_TIMEOUT); if (unlikely(cmdid < 0)) goto free_nbio; @@ -599,45 +624,6 @@ static int nvme_make_request(struct request_queue *q, struct bio *bio) return 0; } -struct sync_cmd_info { - struct task_struct *task; - u32 result; - int status; -}; - -static void sync_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct sync_cmd_info *cmdinfo = ctx; - if (unlikely((unsigned long)cmdinfo == CMD_CTX_CANCELLED)) - return; - if ((unsigned long)cmdinfo == CMD_CTX_FLUSH) - return; - if (unlikely((unsigned long)cmdinfo == CMD_CTX_COMPLETED)) { - dev_warn(nvmeq->q_dmadev, - "completed id %d twice on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - 
return; - } - if (unlikely((unsigned long)cmdinfo == CMD_CTX_INVALID)) { - dev_warn(nvmeq->q_dmadev, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - wake_up_process(cmdinfo->task); -} - -typedef void (*completion_fn)(struct nvme_queue *, void *, - struct nvme_completion *); - -static const completion_fn nvme_completions[4] = { - [sync_completion_id] = sync_completion, - [bio_completion_id] = bio_completion, -}; - static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) { u16 head, phase; @@ -646,9 +632,8 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) phase = nvmeq->cq_phase; for (;;) { - unsigned long data; - void *ptr; - unsigned char handler; + void *ctx; + nvme_completion_fn fn; struct nvme_completion cqe = nvmeq->cqes[head]; if ((le16_to_cpu(cqe.status) & 1) != phase) break; @@ -658,10 +643,8 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) phase = !phase; } - data = free_cmdid(nvmeq, cqe.command_id); - handler = data & 3; - ptr = (void *)(data & ~3UL); - nvme_completions[handler](nvmeq, ptr, &cqe); + ctx = free_cmdid(nvmeq, cqe.command_id, &fn); + fn(nvmeq, ctx, &cqe); } /* If the controller ignores the cq head doorbell and continuously @@ -702,10 +685,25 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) { spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid(nvmeq, cmdid); + cancel_cmdid(nvmeq, cmdid, NULL); spin_unlock_irq(&nvmeq->q_lock); } +struct sync_cmd_info { + struct task_struct *task; + u32 result; + int status; +}; + +static void sync_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + struct sync_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + wake_up_process(cmdinfo->task); +} + /* * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code @@ -719,7 +717,7 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion_id, + cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, timeout); if (cmdid < 0) return cmdid; @@ -1201,18 +1199,15 @@ static void nvme_timeout_ios(struct nvme_queue *nvmeq) int cmdid; for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { - unsigned long data; - void *ptr; - unsigned char handler; + void *ctx; + nvme_completion_fn fn; static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, }; if (!time_after(now, info[cmdid].timeout)) continue; dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid); - data = cancel_cmdid(nvmeq, cmdid); - handler = data & 3; - ptr = (void *)(data & ~3UL); - nvme_completions[handler](nvmeq, ptr, &cqe); + ctx = cancel_cmdid(nvmeq, cmdid, &fn); + fn(nvmeq, ctx, &cqe); } } From 040a93b52a9eee8177ebaf2ba0ee0f9f518d1bf8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 20 Dec 2011 11:04:12 -0500 Subject: [PATCH 096/105] NVMe: Change get_nvmeq to take a dev instead of a namespace Upcoming patches require calling get_nvmeq when we don't have a namespace. Some callers already have the device in a local variable anyway. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 4724655a6ebf..aa2fd66aabd6 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -257,9 +257,9 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, return ctx; } -static struct nvme_queue *get_nvmeq(struct nvme_ns *ns) +static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) { - return ns->dev->queues[get_cpu() + 1]; + return dev->queues[get_cpu() + 1]; } static void put_nvmeq(struct nvme_queue *nvmeq) @@ -606,7 +606,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, static int nvme_make_request(struct request_queue *q, struct bio *bio) { struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns); + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); int result = -EBUSY; spin_lock_irq(&nvmeq->q_lock); @@ -1103,7 +1103,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) /* XXX: metadata */ prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); - nvmeq = get_nvmeq(ns); + nvmeq = get_nvmeq(dev); /* * Since nvme_submit_sync_cmd sleeps, we can't keep preemption * disabled. We may be preempted at any point, and be rescheduled From 5c1281a3bf5655ec1b90db495da3a2b77826ba88 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 20 Dec 2011 11:54:53 -0500 Subject: [PATCH 097/105] NVMe: Change nvme_completion_fn to take a dev The queue is only needed for some rare occasions, and it's more consistent to pass the device around. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index aa2fd66aabd6..b0e8a6dd33b1 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -135,7 +135,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); } -typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, +typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, struct nvme_completion *); struct nvme_cmd_info { @@ -199,7 +199,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) -static void special_completion(struct nvme_queue *nvmeq, void *ctx, +static void special_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) @@ -207,19 +207,19 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx, if (ctx == CMD_CTX_FLUSH) return; if (ctx == CMD_CTX_COMPLETED) { - dev_warn(nvmeq->q_dmadev, + dev_warn(&dev->pci_dev->dev, "completed id %d twice on queue %d\n", cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } if (ctx == CMD_CTX_INVALID) { - dev_warn(nvmeq->q_dmadev, + dev_warn(&dev->pci_dev->dev, "invalid id %d completed on queue %d\n", cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); + dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); } /* @@ -332,29 +332,36 @@ static struct nvme_bio *alloc_nbio(unsigned nseg, gfp_t gfp) sizeof(struct scatterlist) * nseg, gfp); } -static void free_nbio(struct nvme_queue *nvmeq, struct nvme_bio *nbio) +static void free_nbio(struct nvme_dev *dev, struct nvme_bio *nbio) { - nvme_free_prps(nvmeq->dev, 
nbio->prps); + nvme_free_prps(dev, nbio->prps); kfree(nbio); } -static void bio_completion(struct nvme_queue *nvmeq, void *ctx, +static void requeue_bio(struct nvme_dev *dev, struct bio *bio) +{ + struct nvme_queue *nvmeq = get_nvmeq(dev); + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, bio); + put_nvmeq(nvmeq); + wake_up_process(nvme_thread); +} + +static void bio_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { struct nvme_bio *nbio = ctx; struct bio *bio = nbio->bio; u16 status = le16_to_cpup(&cqe->status) >> 1; - dma_unmap_sg(nvmeq->q_dmadev, nbio->sg, nbio->nents, + dma_unmap_sg(&dev->pci_dev->dev, nbio->sg, nbio->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - free_nbio(nvmeq, nbio); + free_nbio(dev, nbio); if (status) { bio_endio(bio, -EIO); } else if (bio->bi_vcnt > bio->bi_idx) { - if (bio_list_empty(&nvmeq->sq_cong)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up_process(nvme_thread); + requeue_bio(dev, bio); } else { bio_endio(bio, 0); } @@ -594,7 +601,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; free_nbio: - free_nbio(nvmeq, nbio); + free_nbio(nvmeq->dev, nbio); nomem: return result; } @@ -644,7 +651,7 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) } ctx = free_cmdid(nvmeq, cqe.command_id, &fn); - fn(nvmeq, ctx, &cqe); + fn(nvmeq->dev, ctx, &cqe); } /* If the controller ignores the cq head doorbell and continuously @@ -695,7 +702,7 @@ struct sync_cmd_info { int status; }; -static void sync_completion(struct nvme_queue *nvmeq, void *ctx, +static void sync_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { struct sync_cmd_info *cmdinfo = ctx; @@ -1207,7 +1214,7 @@ static void nvme_timeout_ios(struct nvme_queue *nvmeq) continue; dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid); ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq, ctx, &cqe); + fn(nvmeq->dev, ctx, &cqe); } } From eca18b2394a9387feeaf14cd884ddddd7a809d19 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 20 Dec 2011 13:34:52 -0500 Subject: [PATCH 098/105] NVMe: Merge the nvme_bio and nvme_prp data structures The new merged data structure is called nvme_iod. This improves performance for mid-sized I/Os (in the 16k range) since we save a memory allocation. It is also a slightly simpler interface to use. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 241 ++++++++++++++++++++++--------------------- 1 file changed, 125 insertions(+), 116 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b0e8a6dd33b1..4517608c068f 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -290,52 +290,70 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } -struct nvme_prps { - int npages; /* 0 means small pool in use */ +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_alloc_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + void *private; /* For the use of the submitter of the I/O */ + int npages; /* In the PRP list. 
0 means small pool in use */ + int offset; /* Of PRP list */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ dma_addr_t first_dma; - __le64 *list[0]; + struct scatterlist sg[0]; }; -static void nvme_free_prps(struct nvme_dev *dev, struct nvme_prps *prps) +static __le64 **iod_list(struct nvme_iod *iod) +{ + return ((void *)iod) + iod->offset; +} + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size) +{ + unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +static struct nvme_iod * +nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) +{ + struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + + sizeof(__le64 *) * nvme_npages(nbytes) + + sizeof(struct scatterlist) * nseg, gfp); + + if (iod) { + iod->offset = offsetof(struct nvme_iod, sg[nseg]); + iod->npages = -1; + iod->length = nbytes; + } + + return iod; +} + +static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = PAGE_SIZE / 8 - 1; int i; - dma_addr_t prp_dma; + __le64 **list = iod_list(iod); + dma_addr_t prp_dma = iod->first_dma; - if (!prps) - return; - - prp_dma = prps->first_dma; - - if (prps->npages == 0) - dma_pool_free(dev->prp_small_pool, prps->list[0], prp_dma); - for (i = 0; i < prps->npages; i++) { - __le64 *prp_list = prps->list[i]; + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, list[0], prp_dma); + for (i = 0; i < iod->npages; i++) { + __le64 *prp_list = list[i]; dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); prp_dma = next_prp_dma; } - kfree(prps); -} - -struct nvme_bio { - struct bio *bio; - int nents; - struct nvme_prps *prps; - struct scatterlist sg[0]; -}; - -/* XXX: use a mempool */ -static struct nvme_bio *alloc_nbio(unsigned nseg, gfp_t gfp) -{ - return kzalloc(sizeof(struct nvme_bio) + - sizeof(struct scatterlist) * nseg, gfp); -} - -static void free_nbio(struct nvme_dev *dev, struct nvme_bio *nbio) -{ - nvme_free_prps(dev, nbio->prps); - kfree(nbio); + kfree(iod); } static void requeue_bio(struct nvme_dev *dev, struct bio *bio) @@ -351,13 +369,13 @@ static void requeue_bio(struct nvme_dev *dev, struct bio *bio) static void bio_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { - struct nvme_bio *nbio = ctx; - struct bio *bio = nbio->bio; + struct nvme_iod *iod = ctx; + struct bio *bio = iod->private; u16 status = le16_to_cpup(&cqe->status) >> 1; - dma_unmap_sg(&dev->pci_dev->dev, nbio->sg, nbio->nents, + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - free_nbio(dev, nbio); + nvme_free_iod(dev, iod); if (status) { bio_endio(bio, -EIO); } else if (bio->bi_vcnt > bio->bi_idx) { @@ -368,25 +386,25 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, } /* length is in bytes. gfp flags indicates whether we may sleep. 
*/ -static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, - struct nvme_common_command *cmd, - struct scatterlist *sg, int *len, - gfp_t gfp) +static int nvme_setup_prps(struct nvme_dev *dev, + struct nvme_common_command *cmd, struct nvme_iod *iod, + int total_len, gfp_t gfp) { struct dma_pool *pool; - int length = *len; + int length = total_len; + struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); int offset = offset_in_page(dma_addr); __le64 *prp_list; + __le64 **list = iod_list(iod); dma_addr_t prp_dma; - int nprps, npages, i; - struct nvme_prps *prps = NULL; + int nprps, i; cmd->prp1 = cpu_to_le64(dma_addr); length -= (PAGE_SIZE - offset); if (length <= 0) - return prps; + return total_len; dma_len -= (PAGE_SIZE - offset); if (dma_len) { @@ -399,46 +417,35 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, if (length <= PAGE_SIZE) { cmd->prp2 = cpu_to_le64(dma_addr); - return prps; + return total_len; } nprps = DIV_ROUND_UP(length, PAGE_SIZE); - npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); - prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, gfp); - if (!prps) { - cmd->prp2 = cpu_to_le64(dma_addr); - *len = (*len - length) + PAGE_SIZE; - return prps; - } - if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; - prps->npages = 0; + iod->npages = 0; } else { pool = dev->prp_page_pool; - prps->npages = 1; + iod->npages = 1; } prp_list = dma_pool_alloc(pool, gfp, &prp_dma); if (!prp_list) { cmd->prp2 = cpu_to_le64(dma_addr); - *len = (*len - length) + PAGE_SIZE; - kfree(prps); - return NULL; + iod->npages = -1; + return (total_len - length) + PAGE_SIZE; } - prps->list[0] = prp_list; - prps->first_dma = prp_dma; + list[0] = prp_list; + iod->first_dma = prp_dma; cmd->prp2 = cpu_to_le64(prp_dma); i = 0; for (;;) { if (i == PAGE_SIZE / 8) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, gfp, &prp_dma); - if (!prp_list) { - *len = (*len - length); - return prps; - } - prps->list[prps->npages++] = prp_list; + if (!prp_list) + return total_len - length; + list[iod->npages++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; @@ -457,21 +464,21 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev, dma_len = sg_dma_len(sg); } - return prps; + return total_len; } /* NVMe scatterlists require no holes in the virtual address */ #define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) -static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, +static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { struct bio_vec *bvec, *bvprv = NULL; struct scatterlist *sg = NULL; int i, old_idx, length = 0, nsegs = 0; - sg_init_table(nbio->sg, psegs); + sg_init_table(iod->sg, psegs); old_idx = bio->bi_idx; bio_for_each_segment(bvec, bio, i) { if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { @@ -479,7 +486,7 @@ static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, } else { if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) break; - sg = sg ? sg + 1 : nbio->sg; + sg = sg ? 
sg + 1 : iod->sg; sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); nsegs++; @@ -488,9 +495,9 @@ static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio, bvprv = bvec; } bio->bi_idx = i; - nbio->nents = nsegs; + iod->nents = nsegs; sg_mark_end(sg); - if (dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir) == 0) { + if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) { bio->bi_idx = old_idx; return -ENOMEM; } @@ -531,7 +538,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct bio *bio) { struct nvme_command *cmnd; - struct nvme_bio *nbio; + struct nvme_iod *iod; enum dma_data_direction dma_dir; int cmdid, length, result = -ENOMEM; u16 control; @@ -544,15 +551,15 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return result; } - nbio = alloc_nbio(psegs, GFP_ATOMIC); - if (!nbio) + iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); + if (!iod) goto nomem; - nbio->bio = bio; + iod->private = bio; result = -EBUSY; - cmdid = alloc_cmdid(nvmeq, nbio, bio_completion, IO_TIMEOUT); + cmdid = alloc_cmdid(nvmeq, iod, bio_completion, IO_TIMEOUT); if (unlikely(cmdid < 0)) - goto free_nbio; + goto free_iod; if ((bio->bi_rw & REQ_FLUSH) && !psegs) return nvme_submit_flush(nvmeq, ns, cmdid); @@ -578,15 +585,15 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - result = nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs); + result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); if (result < 0) - goto free_nbio; + goto free_iod; length = result; cmnd->rw.command_id = cmdid; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg, - &length, GFP_ATOMIC); + length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, + GFP_ATOMIC); cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); @@ -600,8 +607,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; - free_nbio: - free_nbio(nvmeq->dev, nbio); + free_iod: + nvme_free_iod(nvmeq->dev, iod); nomem: return result; } @@ -1005,18 +1012,18 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_map_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, unsigned length, - struct scatterlist **sgp) +static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, unsigned length) { int i, err, count, nents, offset; struct scatterlist *sg; struct page **pages; + struct nvme_iod *iod; if (addr & 3) - return -EINVAL; + return ERR_PTR(-EINVAL); if (!length) - return -EINVAL; + return ERR_PTR(-EINVAL); offset = offset_in_page(addr); count = DIV_ROUND_UP(offset + length, PAGE_SIZE); @@ -1029,7 +1036,8 @@ static int nvme_map_user_pages(struct nvme_dev *dev, int write, goto put_pages; } - sg = kcalloc(count, sizeof(*sg), GFP_KERNEL); + iod = nvme_alloc_iod(count, length, GFP_KERNEL); + sg = iod->sg; sg_init_table(sg, count); for (i = 0; i < count; i++) { sg_set_page(&sg[i], pages[i], @@ -1042,22 +1050,24 @@ static int nvme_map_user_pages(struct nvme_dev *dev, int write, nents = dma_map_sg(&dev->pci_dev->dev, sg, count, write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); if (!nents) - goto put_pages; + goto free_iod; kfree(pages); - *sgp = sg; - return nents; + return iod; + free_iod: + kfree(iod); put_pages: for (i = 0; i < count; i++) put_page(pages[i]); kfree(pages); - return err; + return ERR_PTR(err); } static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, int length, struct scatterlist *sg) + unsigned long addr, int length, struct nvme_iod *iod) { + struct scatterlist *sg = iod->sg; int i, count; count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE); @@ -1074,9 +1084,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) struct nvme_user_io io; struct nvme_command c; unsigned length; - int nents, status; - struct scatterlist *sg; - struct nvme_prps *prps; + int status; + struct nvme_iod *iod; if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; @@ -1086,15 +1095,14 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) case nvme_cmd_write: case nvme_cmd_read: case nvme_cmd_compare: - nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, - length, &sg); + iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length); break; default: return -EINVAL; } - if (nents < 0) - return nents; + if (IS_ERR(iod)) + return PTR_ERR(iod); memset(&c, 0, sizeof(c)); c.rw.opcode = io.opcode; @@ -1108,7 +1116,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.apptag = io.apptag; c.rw.appmask = io.appmask; /* XXX: metadata */ - prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); + length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); nvmeq = get_nvmeq(dev); /* @@ -1123,8 +1131,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) else status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); - nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg); - nvme_free_prps(dev, prps); + nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, iod); + nvme_free_iod(dev, iod); return status; } @@ -1134,9 +1142,8 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, struct nvme_dev *dev = ns->dev; struct nvme_admin_cmd cmd; struct nvme_command c; - int status, length, nents = 0; - struct scatterlist *sg; - struct nvme_prps *prps = NULL; + int status, length; + struct nvme_iod *iod; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -1158,19 +1165,21 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, length = cmd.data_len; if (cmd.data_len) { - nents = nvme_map_user_pages(dev, 1, cmd.addr, length, &sg); - if (nents < 0) - return nents; - prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL); + iod = nvme_map_user_pages(dev, 1, cmd.addr, length); + if (IS_ERR(iod)) + return PTR_ERR(iod); + length = nvme_setup_prps(dev, &c.common, iod, length, + GFP_KERNEL); } if (length != cmd.data_len) status = -ENOMEM; else status = nvme_submit_admin_cmd(dev, &c, NULL); + if (cmd.data_len) { - nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, sg); - nvme_free_prps(dev, prps); + nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, iod); + nvme_free_iod(dev, iod); } return status; } From ff976d724a74e4522e9ca2de1fb37ac4520f454f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 20 Dec 2011 13:53:01 -0500 Subject: [PATCH 099/105] NVMe: Rename IO_TIMEOUT to NVME_IO_TIMEOUT IO_TIMEOUT is a little too generic and might be used by other parts of the kernel in the future. 
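One detail of the previous patch (the nvme_iod merge) is easy to miss from the struct definition alone: everything lives in a single allocation. The sketch below, which only reuses the driver's own field names for illustration, shows the layout that nvme_alloc_iod() sets up and that iod_list() relies on.

        /*
         * One kmalloc() holds three things, back to back:
         *
         *   [ struct nvme_iod | sg[0..nseg-1] | __le64 *list[0..npages-1] ]
         *                                       ^ iod->offset points here
         *
         * iod->offset == offsetof(struct nvme_iod, sg[nseg]), so iod_list()
         * can recover the PRP-list pointer array without a second pointer
         * field in the struct.
         */
        iod = kmalloc(sizeof(struct nvme_iod) +
                        sizeof(__le64 *) * nvme_npages(nbytes) +
                        sizeof(struct scatterlist) * nseg, gfp);
        if (iod)
                iod->offset = offsetof(struct nvme_iod, sg[nseg]);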
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 4517608c068f..1cc01872f6dc 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -45,7 +45,7 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define NVME_MINORS 64 -#define IO_TIMEOUT (5 * HZ) +#define NVME_IO_TIMEOUT (5 * HZ) #define ADMIN_TIMEOUT (60 * HZ) static int nvme_major; @@ -524,7 +524,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) { int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, - special_completion, IO_TIMEOUT); + special_completion, NVME_IO_TIMEOUT); if (unlikely(cmdid < 0)) return cmdid; @@ -557,7 +557,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, iod->private = bio; result = -EBUSY; - cmdid = alloc_cmdid(nvmeq, iod, bio_completion, IO_TIMEOUT); + cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); if (unlikely(cmdid < 0)) goto free_iod; @@ -1129,7 +1129,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; else - status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT); + status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, iod); nvme_free_iod(dev, iod); From 497421880acecd0281d3182d534f3d28c927caec Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 6 Jan 2012 13:42:45 -0700 Subject: [PATCH 100/105] NVMe: Fix DMA mapping for admin commands We were always mapping as DMA_FROM_DEVICE then unmapping with DMA_TO_DEVICE which was clearly not correct. Follow the same pattern as nvme_submit_io() and key off the bottom bit of the opcode to determine whether this is a read or a write. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 1cc01872f6dc..3f8cae9dc960 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1165,7 +1165,8 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, length = cmd.data_len; if (cmd.data_len) { - iod = nvme_map_user_pages(dev, 1, cmd.addr, length); + iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, + length); if (IS_ERR(iod)) return PTR_ERR(iod); length = nvme_setup_prps(dev, &c.common, iod, length, @@ -1178,7 +1179,8 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, status = nvme_submit_admin_cmd(dev, &c, NULL); if (cmd.data_len) { - nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, iod); + nvme_unmap_user_pages(dev, cmd.opcode & 1, cmd.addr, + cmd.data_len, iod); nvme_free_iod(dev, iod); } return status; From fe304c43c6d63e29ed4fc46a874d7a74313788c5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 6 Jan 2012 13:49:25 -0700 Subject: [PATCH 101/105] NVMe: Mark the end of the sg list For user I/O and admin commands, we were forgetting to mark the end of the SG list. 
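Putting this and the previous fix together, the user-page mapping path behaves roughly as in the sketch below (a simplified illustration, not the full nvme_map_user_pages(); 'pages', 'count', 'offset' and 'length' are assumed to come from get_user_pages()). For the commands handled here, the bottom bit of the opcode is set when data flows host-to-controller, which selects the DMA direction, and the last scatterlist entry must be terminated before mapping.

        int write = cmd.opcode & 1;     /* bit 0 set: data flows to the device */
        int i, nents;

        sg_init_table(sg, count);
        for (i = 0; i < count; i++) {
                sg_set_page(&sg[i], pages[i],
                                min_t(int, length, PAGE_SIZE - offset), offset);
                length -= (PAGE_SIZE - offset);
                offset = 0;
        }
        sg_mark_end(&sg[count - 1]);    /* the call this patch adds */

        nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
                                write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);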
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 3f8cae9dc960..71fc9030b4df 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1045,6 +1045,7 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, length -= (PAGE_SIZE - offset); offset = 0; } + sg_mark_end(&sg[i - 1]); err = -ENOMEM; nents = dma_map_sg(&dev->pci_dev->dev, sg, count, From 1c2ad9faaf662b4a525348775deca3ac8e6c35a0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 6 Jan 2012 13:52:56 -0700 Subject: [PATCH 102/105] NVMe: Simplify nvme_unmap_user_pages By using the iod->nents field (the same way other I/O paths do), we can avoid recalculating the number of sg entries at unmap time, and make nvme_unmap_user_pages() easier to call. Also, use the 'write' parameter instead of assuming DMA_FROM_DEVICE. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 71fc9030b4df..3cf82c27a544 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1046,6 +1046,7 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, offset = 0; } sg_mark_end(&sg[i - 1]); + iod->nents = count; err = -ENOMEM; nents = dma_map_sg(&dev->pci_dev->dev, sg, count, @@ -1066,16 +1067,15 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, int length, struct nvme_iod *iod) + struct nvme_iod *iod) { - struct scatterlist *sg = iod->sg; - int i, count; + int i; - count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE); - dma_unmap_sg(&dev->pci_dev->dev, sg, count, DMA_FROM_DEVICE); + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - for (i = 0; i < count; i++) - put_page(sg_page(&sg[i])); + for (i = 0; i < iod->nents; i++) + put_page(sg_page(&iod->sg[i])); } static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) @@ -1132,7 +1132,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) else status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); - nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, iod); + nvme_unmap_user_pages(dev, io.opcode & 1, iod); nvme_free_iod(dev, iod); return status; } @@ -1180,8 +1180,7 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns, status = nvme_submit_admin_cmd(dev, &c, NULL); if (cmd.data_len) { - nvme_unmap_user_pages(dev, cmd.opcode & 1, cmd.addr, - cmd.data_len, iod); + nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); nvme_free_iod(dev, iod); } return status; From 4eeb9215a0d5c9494ca8b20158cc8ee82618840c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Jan 2012 14:35:08 -0700 Subject: [PATCH 103/105] NVMe: Set queue flags correctly QUEUE_FLAG_* are flags (other than QUEUE_FLAG_DEFAULT), so they cannot be ORed together. Set the queue flags using queue_flag_set_unlocked(). 
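To make the distinction concrete (sketch only): the QUEUE_FLAG_* constants are bit numbers, not one-bit masks, so ORing them together yields an unrelated small integer rather than a combination of flags; QUEUE_FLAG_DEFAULT is the exception, being a ready-made mask.

        /* Wrong: NOMERGES and NONROT are bit positions, so this OR does not
         * build a mask containing both flags. */
        ns->queue->queue_flags = QUEUE_FLAG_NOMERGES | QUEUE_FLAG_NONROT;

        /* Right: assign the default mask, then set the remaining flags by
         * bit number. */
        ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
        queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);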
Reported-by: Donald Wood Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 3cf82c27a544..b583603fae5b 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1315,8 +1315,10 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, ns->queue = blk_alloc_queue(GFP_KERNEL); if (!ns->queue) goto out_free_ns; - ns->queue->queue_flags = QUEUE_FLAG_DEFAULT | QUEUE_FLAG_NOMERGES | - QUEUE_FLAG_NONROT | QUEUE_FLAG_DISCARD; + ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); +/* queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */ blk_queue_make_request(ns->queue, nvme_make_request); ns->dev = dev; ns->queue->queuedata = ns; From 366e8217e5ec6ce9f73aec19c46d983110fb4a98 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Jan 2012 16:30:15 -0500 Subject: [PATCH 104/105] NVMe: Version 0.8 Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index b583603fae5b..28c84b18712d 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1726,6 +1726,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.7"); +MODULE_VERSION("0.8"); module_init(nvme_init); module_exit(nvme_exit); From df3481399042200792822b6243e36a95a557b57e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 11 Jan 2012 07:29:56 -0700 Subject: [PATCH 105/105] NVMe: Set number of queues correctly The number of submission & completion queues should be set by calling Set Features, not Get Features. Reported-by: Kwok Kong Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 28c84b18712d..f4996b0e4b1a 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -840,7 +840,7 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, } static int nvme_get_features(struct nvme_dev *dev, unsigned fid, - unsigned dword11, dma_addr_t dma_addr, u32 *result) + unsigned dword11, dma_addr_t dma_addr) { struct nvme_command c; @@ -850,6 +850,20 @@ static int nvme_get_features(struct nvme_dev *dev, unsigned fid, c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int nvme_set_features(struct nvme_dev *dev, unsigned fid, + unsigned dword11, dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + return nvme_submit_admin_cmd(dev, &c, result); } @@ -1365,7 +1379,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) u32 result; u32 q_count = (count - 1) | ((count - 1) << 16); - status = nvme_get_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, + status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, &result); if (status) return -EIO; @@ -1482,7 +1496,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) continue; res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, - dma_addr + 4096, NULL); + dma_addr + 4096); if (res) continue;