Merge branch 'for-3.16/core' of git://git.kernel.dk/linux-block into next

Pull block core updates from Jens Axboe: "It's a big(ish) round this time, lots of development effort has gone into blk-mq in the last 3 months. Generally we're heading to where 3.16 will be a feature complete and performant blk-mq. scsi-mq is progressing nicely and will hopefully be in 3.17. A nvme port is in progress, and the Micron pci-e flash driver, mtip32xx, is converted and will be sent in with the driver pull request for 3.16. This pull request contains: - Lots of prep and support patches for scsi-mq have been integrated. All from Christoph. - API and code cleanups for blk-mq from Christoph. - Lots of good corner case and error handling cleanup fixes for blk-mq from Ming Lei. - A flew of blk-mq updates from me: * Provide strict mappings so that the driver can rely on the CPU to queue mapping. This enables optimizations in the driver. * Provided a bitmap tagging instead of percpu_ida, which never really worked well for blk-mq. percpu_ida relies on the fact that we have a lot more tags available than we really need, it fails miserably for cases where we exhaust (or are close to exhausting) the tag space. * Provide sane support for shared tag maps, as utilized by scsi-mq * Various fixes for IO timeouts. * API cleanups, and lots of perf tweaks and optimizations. - Remove 'buffer' from struct request. This is ancient code, from when requests were always virtually mapped. Kill it, to reclaim some space in struct request. From me. - Remove 'magic' from blk_plug. Since we store these on the stack and since we've never caught any actual bugs with this, lets just get rid of it. From me. - Only call part_in_flight() once for IO completion, as includes two atomic reads. Hopefully we'll get a better implementation soon, as the part IO stats are now one of the more expensive parts of doing IO on blk-mq. From me. - File migration of block code from {mm,fs}/ to block/. This includes bio.c, bio-integrity.c, bounce.c, and ioprio.c. From me, from a discussion on lkml. That should describe the meat of the pull request. Also has various little fixes and cleanups from Dave Jones, Shaohua Li, Duan Jiong, Fengguang Wu, Fabian Frederick, Randy Dunlap, Robert Elliott, and Sam Bradshaw" * 'for-3.16/core' of git://git.kernel.dk/linux-block: (100 commits) blk-mq: push IPI or local end_io decision to __blk_mq_complete_request() blk-mq: remember to start timeout handler for direct queue block: ensure that the timer is always added blk-mq: blk_mq_unregister_hctx() can be static blk-mq: make the sysfs mq/ layout reflect current mappings blk-mq: blk_mq_tag_to_rq should handle flush request block: remove dead code in scsi_ioctl:blk_verify_command blk-mq: request initialization optimizations block: add queue flag for disabling SG merging block: remove 'magic' from struct blk_plug blk-mq: remove alloc_hctx and free_hctx methods blk-mq: add file comments and update copyright notices blk-mq: remove blk_mq_alloc_request_pinned blk-mq: do not use blk_mq_alloc_request_pinned in blk_mq_map_request blk-mq: remove blk_mq_wait_for_tags blk-mq: initialize request in __blk_mq_alloc_request blk-mq: merge blk_mq_alloc_reserved_request into blk_mq_alloc_request blk-mq: add helper to insert requests from irq context blk-mq: remove stale comment for blk_mq_complete_request() blk-mq: allow non-softirq completions ...
2024-11-23 04:31:50 +00:00 · 2014-06-02 09:29:34 -07:00 · 2014-06-02 09:29:34 -07:00 · 681a289548
commit 681a289548
parent 6c52486ded ed851860b4
56 changed files with 2109 additions and 1007 deletions
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@ -62,7 +62,7 @@
 !Efs/mpage.c
 !Efs/namei.c
 !Efs/buffer.c
-!Efs/bio.c
+!Eblock/bio.c
 !Efs/seq_file.c
 !Efs/filesystems.c
 !Efs/fs-writeback.c
--- a/block/Makefile
+++ b/block/Makefile
@ -2,13 +2,15 @@
 # Makefile for the kernel block layer
 #

-obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
 			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
-			genhd.o scsi_ioctl.o partition-generic.o partitions/
+			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
+			partitions/

+obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
@ -20,3 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@ -617,7 +617,7 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
 	if (!bs->bio_integrity_pool)
 		return -1;

-	bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
+	bs->bvec_integrity_pool = biovec_create_pool(pool_size);
 	if (!bs->bvec_integrity_pool) {
 		mempool_destroy(bs->bio_integrity_pool);
 		return -1;
--- a/block/bio.c
+++ b/block/bio.c
@ -305,6 +305,8 @@ static void bio_chain_endio(struct bio *bio, int error)

 /**
 * bio_chain - chain bio completions
+ * @bio: the target bio
+ * @parent: the @bio's parent bio
 *
 * The caller won't have a bi_end_io called when @bio completes - instead,
 * @parent's bi_end_io won't be called until both @parent and @bio have
@ -1011,8 +1013,7 @@ static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
 	bio->bi_private = bmd;
 }

-static struct bio_map_data *bio_alloc_map_data(int nr_segs,
-					       unsigned int iov_count,
+static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
 					       gfp_t gfp_mask)
 {
 	if (iov_count > UIO_MAXIOV)
@ -1154,7 +1155,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	if (offset)
 		nr_pages++;

-	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
+	bmd = bio_alloc_map_data(iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);

@ -1859,7 +1860,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
-mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
+mempool_t *biovec_create_pool(int pool_entries)
 {
 	struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;

@ -1922,7 +1923,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;

-	bs->bvec_pool = biovec_create_pool(bs, pool_size);
+	bs->bvec_pool = biovec_create_pool(pool_size);
 	if (!bs->bvec_pool)
 		goto bad;

--- a/block/blk-core.c
+++ b/block/blk-core.c
@ -146,8 +146,8 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
 	       (unsigned long long)blk_rq_pos(rq),
 	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
-	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
-	       rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));
+	printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
+	       rq->bio, rq->biotail, blk_rq_bytes(rq));

 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		printk(KERN_INFO "  cdb: ");
@ -251,8 +251,10 @@ void blk_sync_queue(struct request_queue *q)
 		struct blk_mq_hw_ctx *hctx;
 		int i;

-		queue_for_each_hw_ctx(q, hctx, i)
-			cancel_delayed_work_sync(&hctx->delayed_work);
+		queue_for_each_hw_ctx(q, hctx, i) {
+			cancel_delayed_work_sync(&hctx->run_work);
+			cancel_delayed_work_sync(&hctx->delay_work);
+		}
 	} else {
 		cancel_delayed_work_sync(&q->delay_work);
 	}
@ -574,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (!q)
 		return NULL;

-	if (percpu_counter_init(&q->mq_usage_counter, 0))
-		goto fail_q;
-
 	q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
 	if (q->id < 0)
-		goto fail_c;
+		goto fail_q;

 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@ -637,8 +636,6 @@ fail_bdi:
 	bdi_destroy(&q->backing_dev_info);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
-fail_c:
-	percpu_counter_destroy(&q->mq_usage_counter);
 fail_q:
 	kmem_cache_free(blk_requestq_cachep, q);
 	return NULL;
@ -846,6 +843,47 @@ static void freed_request(struct request_list *rl, unsigned int flags)
 		__freed_request(rl, sync ^ 1);
 }

+int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
+{
+	struct request_list *rl;
+
+	spin_lock_irq(q->queue_lock);
+	q->nr_requests = nr;
+	blk_queue_congestion_threshold(q);
+
+	/* congestion isn't cgroup aware and follows root blkcg for now */
+	rl = &q->root_rl;
+
+	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
+		blk_set_queue_congested(q, BLK_RW_SYNC);
+	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
+		blk_clear_queue_congested(q, BLK_RW_SYNC);
+
+	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
+		blk_set_queue_congested(q, BLK_RW_ASYNC);
+	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
+		blk_clear_queue_congested(q, BLK_RW_ASYNC);
+
+	blk_queue_for_each_rl(rl, q) {
+		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+			blk_set_rl_full(rl, BLK_RW_SYNC);
+		} else {
+			blk_clear_rl_full(rl, BLK_RW_SYNC);
+			wake_up(&rl->wait[BLK_RW_SYNC]);
+		}
+
+		if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+			blk_set_rl_full(rl, BLK_RW_ASYNC);
+		} else {
+			blk_clear_rl_full(rl, BLK_RW_ASYNC);
+			wake_up(&rl->wait[BLK_RW_ASYNC]);
+		}
+	}
+
+	spin_unlock_irq(q->queue_lock);
+	return 0;
+}
+
 /*
 * Determine if elevator data should be initialized when allocating the
 * request associated with @bio.
@ -1135,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
 	if (q->mq_ops)
-		return blk_mq_alloc_request(q, rw, gfp_mask);
+		return blk_mq_alloc_request(q, rw, gfp_mask, false);
 	else
 		return blk_old_get_request(q, rw, gfp_mask);
 }
@ -1231,12 +1269,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
 static void part_round_stats_single(int cpu, struct hd_struct *part,
 				    unsigned long now)
 {
+	int inflight;
+
 	if (now == part->stamp)
 		return;

-	if (part_in_flight(part)) {
+	inflight = part_in_flight(part);
+	if (inflight) {
 		__part_stat_add(cpu, part, time_in_queue,
-				part_in_flight(part) * (now - part->stamp));
+				inflight * (now - part->stamp));
 		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
 	}
 	part->stamp = now;
@ -1360,7 +1401,6 @@ void blk_add_request_payload(struct request *rq, struct page *page,

 	rq->__data_len = rq->resid_len = len;
 	rq->nr_phys_segments = 1;
-	rq->buffer = bio_data(bio);
 }
 EXPORT_SYMBOL_GPL(blk_add_request_payload);

@ -1402,12 +1442,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 	bio->bi_next = req->bio;
 	req->bio = bio;

-	/*
-	 * may not be valid. if the low level driver said
-	 * it didn't need a bounce buffer then it better
-	 * not touch req->buffer either...
-	 */
-	req->buffer = bio_data(bio);
 	req->__sector = bio->bi_iter.bi_sector;
 	req->__data_len += bio->bi_iter.bi_size;
 	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
@ -1432,6 +1466,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 * added on the elevator at this point.  In addition, we don't have
 * reliable access to the elevator outside queue lock.  Only check basic
 * merging parameters without querying the elevator.
+ *
+ * Caller must ensure !blk_queue_nomerges(q) beforehand.
 */
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 			    unsigned int *request_count)
@ -1441,9 +1477,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	bool ret = false;
 	struct list_head *plug_list;

-	if (blk_queue_nomerges(q))
-		goto out;
-
 	plug = current->plug;
 	if (!plug)
 		goto out;
@ -1522,7 +1555,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * Check if we can merge with the plugged list before grabbing
 	 * any locks.
 	 */
-	if (blk_attempt_plug_merge(q, bio, &request_count))
+	if (!blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count))
 		return;

 	spin_lock_irq(q->queue_lock);
@ -1654,7 +1688,7 @@ static int __init fail_make_request_debugfs(void)
 	struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
 						NULL, &fail_make_request);

-	return IS_ERR(dir) ? PTR_ERR(dir) : 0;
+	return PTR_ERR_OR_ZERO(dir);
 }

 late_initcall(fail_make_request_debugfs);
@ -2434,7 +2468,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	}

 	req->__data_len -= total_bytes;
-	req->buffer = bio_data(req->bio);

 	/* update sector only for requests with clear definition of sector */
 	if (req->cmd_type == REQ_TYPE_FS)
@ -2503,7 +2536,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
 /*
 * queue lock must be held
 */
-static void blk_finish_request(struct request *req, int error)
+void blk_finish_request(struct request *req, int error)
 {
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(req->q, req);
@ -2529,6 +2562,7 @@ static void blk_finish_request(struct request *req, int error)
 		__blk_put_request(req->q, req);
 	}
 }
+EXPORT_SYMBOL(blk_finish_request);

 /**
 * blk_end_bidi_request - Complete a bidi request
@ -2752,10 +2786,9 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
 	rq->cmd_flags |= bio->bi_rw & REQ_WRITE;

-	if (bio_has_data(bio)) {
+	if (bio_has_data(bio))
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
-		rq->buffer = bio_data(bio);
-	}
+
 	rq->__data_len = bio->bi_iter.bi_size;
 	rq->bio = rq->biotail = bio;

@ -2831,7 +2864,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);

 /*
 * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied.
+ * The actual data parts (e.g. ->cmd, ->sense) are not copied.
 */
 static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 {
@ -2857,7 +2890,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 *
 * Description:
 *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- *     The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense)
+ *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
 *     are not copied, and copying such parts is the caller's responsibility.
 *     Also, pages which the original bios are pointing to are not copied
 *     and the cloned bios just point same pages.
@ -2904,20 +2937,25 @@ free_and_out:
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);

-int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
+int kblockd_schedule_work(struct work_struct *work)
 {
 	return queue_work(kblockd_workqueue, work);
 }
 EXPORT_SYMBOL(kblockd_schedule_work);

-int kblockd_schedule_delayed_work(struct request_queue *q,
-			struct delayed_work *dwork, unsigned long delay)
+int kblockd_schedule_delayed_work(struct delayed_work *dwork,
+				  unsigned long delay)
 {
 	return queue_delayed_work(kblockd_workqueue, dwork, delay);
 }
 EXPORT_SYMBOL(kblockd_schedule_delayed_work);

-#define PLUG_MAGIC	0x91827364
+int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
+				     unsigned long delay)
+{
+	return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);

 /**
 * blk_start_plug - initialize blk_plug and track it inside the task_struct
@ -2937,7 +2975,6 @@ void blk_start_plug(struct blk_plug *plug)
 {
 	struct task_struct *tsk = current;

-	plug->magic = PLUG_MAGIC;
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->mq_list);
 	INIT_LIST_HEAD(&plug->cb_list);
@ -3034,8 +3071,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	LIST_HEAD(list);
 	unsigned int depth;

-	BUG_ON(plug->magic != PLUG_MAGIC);
-
 	flush_plug_callbacks(plug, from_schedule);

 	if (!list_empty(&plug->mq_list))
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq)
 	blk_clear_rq_complete(rq);
 }

-static void mq_flush_run(struct work_struct *work)
-{
-	struct request *rq;
-
-	rq = container_of(work, struct request, mq_flush_work);
-
-	memset(&rq->csd, 0, sizeof(rq->csd));
-	blk_mq_insert_request(rq, false, true, false);
-}
-
 static bool blk_flush_queue_rq(struct request *rq, bool add_front)
 {
 	if (rq->q->mq_ops) {
-		INIT_WORK(&rq->mq_flush_work, mq_flush_run);
-		kblockd_schedule_work(rq->q, &rq->mq_flush_work);
+		struct request_queue *q = rq->q;
+
+		blk_mq_add_to_requeue_list(rq, add_front);
+		blk_mq_kick_requeue_list(q);
 		return false;
 	} else {
 		if (add_front)
@ -231,8 +223,10 @@ static void flush_end_io(struct request *flush_rq, int error)
 	struct request *rq, *n;
 	unsigned long flags = 0;

-	if (q->mq_ops)
+	if (q->mq_ops) {
 		spin_lock_irqsave(&q->mq_flush_lock, flags);
+		q->flush_rq->cmd_flags = 0;
+	}

 	running = &q->flush_queue[q->flush_running_idx];
 	BUG_ON(q->flush_pending_idx == q->flush_running_idx);
@ -306,23 +300,9 @@ static bool blk_kick_flush(struct request_queue *q)
 	 */
 	q->flush_pending_idx ^= 1;

-	if (q->mq_ops) {
-		struct blk_mq_ctx *ctx = first_rq->mq_ctx;
-		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
-
-		blk_mq_rq_init(hctx, q->flush_rq);
-		q->flush_rq->mq_ctx = ctx;
-
-		/*
-		 * Reuse the tag value from the fist waiting request,
-		 * with blk-mq the tag is generated during request
-		 * allocation and drivers can rely on it being inside
-		 * the range they asked for.
-		 */
-		q->flush_rq->tag = first_rq->tag;
-	} else {
-		blk_rq_init(q, q->flush_rq);
-	}
+	blk_rq_init(q, q->flush_rq);
+	if (q->mq_ops)
+		blk_mq_clone_flush_request(q->flush_rq, first_rq);

 	q->flush_rq->cmd_type = REQ_TYPE_FS;
 	q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete);
 *     iopoll handler will not be invoked again before blk_iopoll_sched_prep()
 *     is called.
 **/
-void blk_iopoll_complete(struct blk_iopoll *iopoll)
+void blk_iopoll_complete(struct blk_iopoll *iop)
 {
 	unsigned long flags;

 	local_irq_save(flags);
-	__blk_iopoll_complete(iopoll);
+	__blk_iopoll_complete(iop);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(blk_iopoll_complete);
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same);
 *  Generate and issue number of bios with zerofiled pages.
 */

-int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			sector_t nr_sects, gfp_t gfp_mask)
+static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+				  sector_t nr_sects, gfp_t gfp_mask)
 {
 	int ret;
 	struct bio *bio;
--- a/block/blk-map.c
+++ b/block/blk-map.c
@ -155,7 +155,6 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 	if (!bio_flagged(bio, BIO_USER_MAPPED))
 		rq->cmd_flags |= REQ_COPY_USER;

-	rq->buffer = NULL;
 	return 0;
 unmap_rq:
 	blk_rq_unmap_user(bio);
@ -238,7 +237,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	blk_queue_bounce(q, &bio);
 	bio_get(bio);
 	blk_rq_bio_prep(q, rq, bio);
-	rq->buffer = NULL;
 	return 0;
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);
@ -325,7 +323,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	}

 	blk_queue_bounce(q, &rq->bio);
-	rq->buffer = NULL;
 	return 0;
 }
 EXPORT_SYMBOL(blk_rq_map_kern);
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@ -13,7 +13,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 					     struct bio *bio)
 {
 	struct bio_vec bv, bvprv = { NULL };
-	int cluster, high, highprv = 1;
+	int cluster, high, highprv = 1, no_sg_merge;
 	unsigned int seg_size, nr_phys_segs;
 	struct bio *fbio, *bbio;
 	struct bvec_iter iter;
@ -35,12 +35,21 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	cluster = blk_queue_cluster(q);
 	seg_size = 0;
 	nr_phys_segs = 0;
+	no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
+	high = 0;
 	for_each_bio(bio) {
 		bio_for_each_segment(bv, bio, iter) {
+			/*
+			 * If SG merging is disabled, each bio vector is
+			 * a segment
+			 */
+			if (no_sg_merge)
+				goto new_segment;
+
 			/*
 			 * the trick here is making sure that a high page is
-			 * never considered part of another segment, since that
-			 * might change with the bounce page.
+			 * never considered part of another segment, since
+			 * that might change with the bounce page.
 			 */
 			high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
 			if (!high && !highprv && cluster) {
@ -84,11 +93,16 @@ void blk_recalc_rq_segments(struct request *rq)

 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-	struct bio *nxt = bio->bi_next;
+	if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags))
+		bio->bi_phys_segments = bio->bi_vcnt;
+	else {
+		struct bio *nxt = bio->bi_next;
+
+		bio->bi_next = NULL;
+		bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+		bio->bi_next = nxt;
+	}

-	bio->bi_next = NULL;
-	bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
-	bio->bi_next = nxt;
 	bio->bi_flags |= (1 << BIO_SEG_VALID);
 }
 EXPORT_SYMBOL(blk_recount_segments);
--- a/block/blk-mq-cpu.c
+++ b/block/blk-mq-cpu.c
@ -1,3 +1,8 @@
+/*
+ * CPU notifier helper code for blk-mq
+ *
+ * Copyright (C) 2013-2014 Jens Axboe
+ */
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@ -18,14 +23,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
 {
 	unsigned int cpu = (unsigned long) hcpu;
 	struct blk_mq_cpu_notifier *notify;
+	int ret = NOTIFY_OK;

 	raw_spin_lock(&blk_mq_cpu_notify_lock);

-	list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
-		notify->notify(notify->data, action, cpu);
+	list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
+		ret = notify->notify(notify->data, action, cpu);
+		if (ret != NOTIFY_OK)
+			break;
+	}

 	raw_spin_unlock(&blk_mq_cpu_notify_lock);
-	return NOTIFY_OK;
+	return ret;
 }

 void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
@ -45,7 +54,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
 }

 void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
-			      void (*fn)(void *, unsigned long, unsigned int),
+			      int (*fn)(void *, unsigned long, unsigned int),
 			      void *data)
 {
 	notifier->notify = fn;
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@ -1,3 +1,8 @@
+/*
+ * CPU <-> hardware queue mapping helpers
+ *
+ * Copyright (C) 2013-2014 Jens Axboe
+ */
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/module.h>
@ -80,19 +85,35 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
 	return 0;
 }

-unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
+unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
 {
 	unsigned int *map;

 	/* If cpus are offline, map them to first hctx */
 	map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
-				reg->numa_node);
+				set->numa_node);
 	if (!map)
 		return NULL;

-	if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
+	if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
 		return map;

 	kfree(map);
 	return NULL;
 }
+
+/*
+ * We have no quick way of doing reverse lookups. This is only used at
+ * queue init time, so runtime isn't important.
+ */
+int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		if (index == mq_map[i])
+			return cpu_to_node(i);
+	}
+
+	return NUMA_NO_NODE;
+}
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@ -203,59 +203,24 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }

-static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
-	ssize_t ret;
-
-	spin_lock(&hctx->lock);
-	ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
-	spin_unlock(&hctx->lock);
-
-	return ret;
-}
-
-static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
-					 const char *page, size_t len)
-{
-	struct blk_mq_ctx *ctx;
-	unsigned long ret;
-	unsigned int i;
-
-	if (kstrtoul(page, 10, &ret)) {
-		pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
-		return -EINVAL;
-	}
-
-	spin_lock(&hctx->lock);
-	if (ret)
-		hctx->flags |= BLK_MQ_F_SHOULD_IPI;
-	else
-		hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
-	spin_unlock(&hctx->lock);
-
-	hctx_for_each_ctx(hctx, ctx, i)
-		ctx->ipi_redirect = !!ret;
-
-	return len;
-}
-
 static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
 	return blk_mq_tag_sysfs_show(hctx->tags, page);
 }

+static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+	return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
+}
+
 static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
-	unsigned int i, queue_num, first = 1;
+	unsigned int i, first = 1;
 	ssize_t ret = 0;

 	blk_mq_disable_hotplug();

-	for_each_online_cpu(i) {
-		queue_num = hctx->queue->mq_map[i];
-		if (queue_num != hctx->queue_num)
-			continue;
-
+	for_each_cpu(i, hctx->cpumask) {
 		if (first)
 			ret += sprintf(ret + page, "%u", i);
 		else
@ -307,15 +272,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
 	.attr = {.name = "dispatched", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_dispatched_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
+	.attr = {.name = "active", .mode = S_IRUGO },
+	.show = blk_mq_hw_sysfs_active_show,
+};
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
 	.attr = {.name = "pending", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_rq_list_show,
 };
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
-	.attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
-	.show = blk_mq_hw_sysfs_ipi_show,
-	.store = blk_mq_hw_sysfs_ipi_store,
-};
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
 	.attr = {.name = "tags", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_tags_show,
@ -330,9 +294,9 @@ static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_run.attr,
 	&blk_mq_hw_sysfs_dispatched.attr,
 	&blk_mq_hw_sysfs_pending.attr,
-	&blk_mq_hw_sysfs_ipi.attr,
 	&blk_mq_hw_sysfs_tags.attr,
 	&blk_mq_hw_sysfs_cpus.attr,
+	&blk_mq_hw_sysfs_active.attr,
 	NULL,
 };

@ -363,6 +327,42 @@ static struct kobj_type blk_mq_hw_ktype = {
 	.release	= blk_mq_sysfs_release,
 };

+static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_ctx *ctx;
+	int i;
+
+	if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+		return;
+
+	hctx_for_each_ctx(hctx, ctx, i)
+		kobject_del(&ctx->kobj);
+
+	kobject_del(&hctx->kobj);
+}
+
+static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_ctx *ctx;
+	int i, ret;
+
+	if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+		return 0;
+
+	ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
+	if (ret)
+		return ret;
+
+	hctx_for_each_ctx(hctx, ctx, i) {
+		ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
 void blk_mq_unregister_disk(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
@ -371,11 +371,11 @@ void blk_mq_unregister_disk(struct gendisk *disk)
 	int i, j;

 	queue_for_each_hw_ctx(q, hctx, i) {
-		hctx_for_each_ctx(hctx, ctx, j) {
-			kobject_del(&ctx->kobj);
+		blk_mq_unregister_hctx(hctx);
+
+		hctx_for_each_ctx(hctx, ctx, j)
 			kobject_put(&ctx->kobj);
-		}
-		kobject_del(&hctx->kobj);
+
 		kobject_put(&hctx->kobj);
 	}

@ -386,15 +386,30 @@ void blk_mq_unregister_disk(struct gendisk *disk)
 	kobject_put(&disk_to_dev(disk)->kobj);
 }

+static void blk_mq_sysfs_init(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_ctx *ctx;
+	int i, j;
+
+	kobject_init(&q->mq_kobj, &blk_mq_ktype);
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
+
+		hctx_for_each_ctx(hctx, ctx, j)
+			kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
+	}
+}
+
 int blk_mq_register_disk(struct gendisk *disk)
 {
 	struct device *dev = disk_to_dev(disk);
 	struct request_queue *q = disk->queue;
 	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
-	int ret, i, j;
+	int ret, i;

-	kobject_init(&q->mq_kobj, &blk_mq_ktype);
+	blk_mq_sysfs_init(q);

 	ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
 	if (ret < 0)
@ -403,20 +418,10 @@ int blk_mq_register_disk(struct gendisk *disk)
 	kobject_uevent(&q->mq_kobj, KOBJ_ADD);

 	queue_for_each_hw_ctx(q, hctx, i) {
-		kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
-		ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i);
+		hctx->flags |= BLK_MQ_F_SYSFS_UP;
+		ret = blk_mq_register_hctx(hctx);
 		if (ret)
 			break;
-
-		if (!hctx->nr_ctx)
-			continue;
-
-		hctx_for_each_ctx(hctx, ctx, j) {
-			kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
-			ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
-			if (ret)
-				break;
-		}
 	}

 	if (ret) {
@ -426,3 +431,26 @@ int blk_mq_register_disk(struct gendisk *disk)

 	return 0;
 }
+
+void blk_mq_sysfs_unregister(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_unregister_hctx(hctx);
+}
+
+int blk_mq_sysfs_register(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i, ret = 0;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		ret = blk_mq_register_hctx(hctx);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@ -1,78 +1,345 @@
+/*
+ * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
+ * over multiple cachelines to avoid ping-pong between multiple submitters
+ * or submitter and completer. Uses rolling wakeups to avoid falling of
+ * the scaling cliff when we run out of tags and have to start putting
+ * submitters to sleep.
+ *
+ * Uses active queue tracking to support fairer distribution of tags
+ * between multiple submitters when a shared tag map is used.
+ *
+ * Copyright (C) 2013-2014 Jens Axboe
+ */
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/percpu_ida.h>
+#include <linux/random.h>

 #include <linux/blk-mq.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"

-/*
- * Per tagged queue (tag address space) map
- */
-struct blk_mq_tags {
-	unsigned int nr_tags;
-	unsigned int nr_reserved_tags;
-	unsigned int nr_batch_move;
-	unsigned int nr_max_cache;
-
-	struct percpu_ida free_tags;
-	struct percpu_ida reserved_tags;
-};
-
-void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
+static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
 {
-	int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
-	blk_mq_put_tag(tags, tag);
+	int i;
+
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+		int ret;
+
+		ret = find_first_zero_bit(&bm->word, bm->depth);
+		if (ret < bm->depth)
+			return true;
+	}
+
+	return false;
 }

 bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
 {
-	return !tags ||
-		percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
+	if (!tags)
+		return true;
+
+	return bt_has_free_tags(&tags->bitmap_tags);
 }

-static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
+static inline void bt_index_inc(unsigned int *index)
+{
+	*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
+}
+
+/*
+ * If a previously inactive queue goes active, bump the active user count.
+ */
+bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
+	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		atomic_inc(&hctx->tags->active_queues);
+
+	return true;
+}
+
+/*
+ * Wakeup all potentially sleeping on normal (non-reserved) tags
+ */
+static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
+{
+	struct blk_mq_bitmap_tags *bt;
+	int i, wake_index;
+
+	bt = &tags->bitmap_tags;
+	wake_index = bt->wake_index;
+	for (i = 0; i < BT_WAIT_QUEUES; i++) {
+		struct bt_wait_state *bs = &bt->bs[wake_index];
+
+		if (waitqueue_active(&bs->wait))
+			wake_up(&bs->wait);
+
+		bt_index_inc(&wake_index);
+	}
+}
+
+/*
+ * If a previously busy queue goes inactive, potential waiters could now
+ * be allowed to queue. Wake them up and check.
+ */
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+
+	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return;
+
+	atomic_dec(&tags->active_queues);
+
+	blk_mq_tag_wakeup_all(tags);
+}
+
+/*
+ * For shared tag users, we track the number of currently active users
+ * and attempt to provide a fair share of the tag depth for each of them.
+ */
+static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
+				  struct blk_mq_bitmap_tags *bt)
+{
+	unsigned int depth, users;
+
+	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return true;
+	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return true;
+
+	/*
+	 * Don't try dividing an ant
+	 */
+	if (bt->depth == 1)
+		return true;
+
+	users = atomic_read(&hctx->tags->active_queues);
+	if (!users)
+		return true;
+
+	/*
+	 * Allow at least some tags
+	 */
+	depth = max((bt->depth + users - 1) / users, 4U);
+	return atomic_read(&hctx->nr_active) < depth;
+}
+
+static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
+{
+	int tag, org_last_tag, end;
+
+	org_last_tag = last_tag;
+	end = bm->depth;
+	do {
+restart:
+		tag = find_next_zero_bit(&bm->word, end, last_tag);
+		if (unlikely(tag >= end)) {
+			/*
+			 * We started with an offset, start from 0 to
+			 * exhaust the map.
+			 */
+			if (org_last_tag && last_tag) {
+				end = last_tag;
+				last_tag = 0;
+				goto restart;
+			}
+			return -1;
+		}
+		last_tag = tag + 1;
+	} while (test_and_set_bit_lock(tag, &bm->word));
+
+	return tag;
+}
+
+/*
+ * Straight forward bitmap tag implementation, where each bit is a tag
+ * (cleared == free, and set == busy). The small twist is using per-cpu
+ * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
+ * contexts. This enables us to drastically limit the space searched,
+ * without dirtying an extra shared cacheline like we would if we stored
+ * the cache value inside the shared blk_mq_bitmap_tags structure. On top
+ * of that, each word of tags is in a separate cacheline. This means that
+ * multiple users will tend to stick to different cachelines, at least
+ * until the map is exhausted.
+ */
+static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
+		    unsigned int *tag_cache)
+{
+	unsigned int last_tag, org_last_tag;
+	int index, i, tag;
+
+	if (!hctx_may_queue(hctx, bt))
+		return -1;
+
+	last_tag = org_last_tag = *tag_cache;
+	index = TAG_TO_INDEX(bt, last_tag);
+
+	for (i = 0; i < bt->map_nr; i++) {
+		tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
+		if (tag != -1) {
+			tag += (index << bt->bits_per_word);
+			goto done;
+		}
+
+		last_tag = 0;
+		if (++index >= bt->map_nr)
+			index = 0;
+	}
+
+	*tag_cache = 0;
+	return -1;
+
+	/*
+	 * Only update the cache from the allocation path, if we ended
+	 * up using the specific cached tag.
+	 */
+done:
+	if (tag == org_last_tag) {
+		last_tag = tag + 1;
+		if (last_tag >= bt->depth - 1)
+			last_tag = 0;
+
+		*tag_cache = last_tag;
+	}
+
+	return tag;
+}
+
+static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
+					 struct blk_mq_hw_ctx *hctx)
+{
+	struct bt_wait_state *bs;
+
+	if (!hctx)
+		return &bt->bs[0];
+
+	bs = &bt->bs[hctx->wait_index];
+	bt_index_inc(&hctx->wait_index);
+	return bs;
+}
+
+static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
+		  unsigned int *last_tag, gfp_t gfp)
+{
+	struct bt_wait_state *bs;
+	DEFINE_WAIT(wait);
+	int tag;
+
+	tag = __bt_get(hctx, bt, last_tag);
+	if (tag != -1)
+		return tag;
+
+	if (!(gfp & __GFP_WAIT))
+		return -1;
+
+	bs = bt_wait_ptr(bt, hctx);
+	do {
+		bool was_empty;
+
+		was_empty = list_empty(&wait.task_list);
+		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+		tag = __bt_get(hctx, bt, last_tag);
+		if (tag != -1)
+			break;
+
+		if (was_empty)
+			atomic_set(&bs->wait_cnt, bt->wake_cnt);
+
+		io_schedule();
+	} while (1);
+
+	finish_wait(&bs->wait, &wait);
+	return tag;
+}
+
+static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags,
+				     struct blk_mq_hw_ctx *hctx,
+				     unsigned int *last_tag, gfp_t gfp)
 {
 	int tag;

-	tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ?
-			       TASK_UNINTERRUPTIBLE : TASK_RUNNING);
-	if (tag < 0)
-		return BLK_MQ_TAG_FAIL;
-	return tag + tags->nr_reserved_tags;
+	tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp);
+	if (tag >= 0)
+		return tag + tags->nr_reserved_tags;
+
+	return BLK_MQ_TAG_FAIL;
 }

 static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
 					      gfp_t gfp)
 {
-	int tag;
+	int tag, zero = 0;

 	if (unlikely(!tags->nr_reserved_tags)) {
 		WARN_ON_ONCE(1);
 		return BLK_MQ_TAG_FAIL;
 	}

-	tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ?
-			       TASK_UNINTERRUPTIBLE : TASK_RUNNING);
+	tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp);
 	if (tag < 0)
 		return BLK_MQ_TAG_FAIL;
+
 	return tag;
 }

-unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved)
+unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
+			    gfp_t gfp, bool reserved)
 {
 	if (!reserved)
-		return __blk_mq_get_tag(tags, gfp);
+		return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);

-	return __blk_mq_get_reserved_tag(tags, gfp);
+	return __blk_mq_get_reserved_tag(hctx->tags, gfp);
+}
+
+static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
+{
+	int i, wake_index;
+
+	wake_index = bt->wake_index;
+	for (i = 0; i < BT_WAIT_QUEUES; i++) {
+		struct bt_wait_state *bs = &bt->bs[wake_index];
+
+		if (waitqueue_active(&bs->wait)) {
+			if (wake_index != bt->wake_index)
+				bt->wake_index = wake_index;
+
+			return bs;
+		}
+
+		bt_index_inc(&wake_index);
+	}
+
+	return NULL;
+}
+
+static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
+{
+	const int index = TAG_TO_INDEX(bt, tag);
+	struct bt_wait_state *bs;
+
+	/*
+	 * The unlock memory barrier need to order access to req in free
+	 * path and clearing tag bit
+	 */
+	clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
+
+	bs = bt_wake_ptr(bt);
+	if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
+		atomic_set(&bs->wait_cnt, bt->wake_cnt);
+		bt_index_inc(&bt->wake_index);
+		wake_up(&bs->wait);
+	}
 }

 static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
 {
 	BUG_ON(tag >= tags->nr_tags);

-	percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
+	bt_clear_tag(&tags->bitmap_tags, tag);
 }

 static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
@ -80,22 +347,43 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
 {
 	BUG_ON(tag >= tags->nr_reserved_tags);

-	percpu_ida_free(&tags->reserved_tags, tag);
+	bt_clear_tag(&tags->breserved_tags, tag);
 }

-void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
+		    unsigned int *last_tag)
 {
-	if (tag >= tags->nr_reserved_tags)
-		__blk_mq_put_tag(tags, tag);
-	else
+	struct blk_mq_tags *tags = hctx->tags;
+
+	if (tag >= tags->nr_reserved_tags) {
+		const int real_tag = tag - tags->nr_reserved_tags;
+
+		__blk_mq_put_tag(tags, real_tag);
+		*last_tag = real_tag;
+	} else
 		__blk_mq_put_reserved_tag(tags, tag);
 }

-static int __blk_mq_tag_iter(unsigned id, void *data)
+static void bt_for_each_free(struct blk_mq_bitmap_tags *bt,
+			     unsigned long *free_map, unsigned int off)
 {
-	unsigned long *tag_map = data;
-	__set_bit(id, tag_map);
-	return 0;
+	int i;
+
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+		int bit = 0;
+
+		do {
+			bit = find_next_zero_bit(&bm->word, bm->depth, bit);
+			if (bit >= bm->depth)
+				break;
+
+			__set_bit(bit + off, free_map);
+			bit++;
+		} while (1);
+
+		off += (1 << bt->bits_per_word);
+	}
 }

 void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
@ -109,21 +397,128 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
 	if (!tag_map)
 		return;

-	percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
+	bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags);
 	if (tags->nr_reserved_tags)
-		percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
-			tag_map);
+		bt_for_each_free(&tags->breserved_tags, tag_map, 0);

 	fn(data, tag_map);
 	kfree(tag_map);
 }
+EXPORT_SYMBOL(blk_mq_tag_busy_iter);
+
+static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
+{
+	unsigned int i, used;
+
+	for (i = 0, used = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+
+		used += bitmap_weight(&bm->word, bm->depth);
+	}
+
+	return bt->depth - used;
+}
+
+static void bt_update_count(struct blk_mq_bitmap_tags *bt,
+			    unsigned int depth)
+{
+	unsigned int tags_per_word = 1U << bt->bits_per_word;
+	unsigned int map_depth = depth;
+
+	if (depth) {
+		int i;
+
+		for (i = 0; i < bt->map_nr; i++) {
+			bt->map[i].depth = min(map_depth, tags_per_word);
+			map_depth -= bt->map[i].depth;
+		}
+	}
+
+	bt->wake_cnt = BT_WAIT_BATCH;
+	if (bt->wake_cnt > depth / 4)
+		bt->wake_cnt = max(1U, depth / 4);
+
+	bt->depth = depth;
+}
+
+static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
+			int node, bool reserved)
+{
+	int i;
+
+	bt->bits_per_word = ilog2(BITS_PER_LONG);
+
+	/*
+	 * Depth can be zero for reserved tags, that's not a failure
+	 * condition.
+	 */
+	if (depth) {
+		unsigned int nr, tags_per_word;
+
+		tags_per_word = (1 << bt->bits_per_word);
+
+		/*
+		 * If the tag space is small, shrink the number of tags
+		 * per word so we spread over a few cachelines, at least.
+		 * If less than 4 tags, just forget about it, it's not
+		 * going to work optimally anyway.
+		 */
+		if (depth >= 4) {
+			while (tags_per_word * 4 > depth) {
+				bt->bits_per_word--;
+				tags_per_word = (1 << bt->bits_per_word);
+			}
+		}
+
+		nr = ALIGN(depth, tags_per_word) / tags_per_word;
+		bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
+						GFP_KERNEL, node);
+		if (!bt->map)
+			return -ENOMEM;
+
+		bt->map_nr = nr;
+	}
+
+	bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
+	if (!bt->bs) {
+		kfree(bt->map);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < BT_WAIT_QUEUES; i++)
+		init_waitqueue_head(&bt->bs[i].wait);
+
+	bt_update_count(bt, depth);
+	return 0;
+}
+
+static void bt_free(struct blk_mq_bitmap_tags *bt)
+{
+	kfree(bt->map);
+	kfree(bt->bs);
+}
+
+static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
+						   int node)
+{
+	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+
+	if (bt_alloc(&tags->bitmap_tags, depth, node, false))
+		goto enomem;
+	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
+		goto enomem;
+
+	return tags;
+enomem:
+	bt_free(&tags->bitmap_tags);
+	kfree(tags);
+	return NULL;
+}

 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 				     unsigned int reserved_tags, int node)
 {
-	unsigned int nr_tags, nr_cache;
 	struct blk_mq_tags *tags;
-	int ret;

 	if (total_tags > BLK_MQ_TAG_MAX) {
 		pr_err("blk-mq: tag depth too large\n");
@ -134,73 +529,59 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	if (!tags)
 		return NULL;

-	nr_tags = total_tags - reserved_tags;
-	nr_cache = nr_tags / num_possible_cpus();
-
-	if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
-		nr_cache = BLK_MQ_TAG_CACHE_MIN;
-	else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
-		nr_cache = BLK_MQ_TAG_CACHE_MAX;
-
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
-	tags->nr_max_cache = nr_cache;
-	tags->nr_batch_move = max(1u, nr_cache / 2);

-	ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags -
-				tags->nr_reserved_tags,
-				tags->nr_max_cache,
-				tags->nr_batch_move);
-	if (ret)
-		goto err_free_tags;
-
-	if (reserved_tags) {
-		/*
-		 * With max_cahe and batch set to 1, the allocator fallbacks to
-		 * no cached. It's fine reserved tags allocation is slow.
-		 */
-		ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags,
-				1, 1);
-		if (ret)
-			goto err_reserved_tags;
-	}
-
-	return tags;
-
-err_reserved_tags:
-	percpu_ida_destroy(&tags->free_tags);
-err_free_tags:
-	kfree(tags);
-	return NULL;
+	return blk_mq_init_bitmap_tags(tags, node);
 }

 void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
-	percpu_ida_destroy(&tags->free_tags);
-	percpu_ida_destroy(&tags->reserved_tags);
+	bt_free(&tags->bitmap_tags);
+	bt_free(&tags->breserved_tags);
 	kfree(tags);
 }

+void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
+{
+	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+
+	*tag = prandom_u32() % depth;
+}
+
+int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
+{
+	tdepth -= tags->nr_reserved_tags;
+	if (tdepth > tags->nr_tags)
+		return -EINVAL;
+
+	/*
+	 * Don't need (or can't) update reserved tags here, they remain
+	 * static and should never need resizing.
+	 */
+	bt_update_count(&tags->bitmap_tags, tdepth);
+	blk_mq_tag_wakeup_all(tags);
+	return 0;
+}
+
 ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
 {
 	char *orig_page = page;
-	unsigned int cpu;
+	unsigned int free, res;

 	if (!tags)
 		return 0;

-	page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u,"
-			" max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags,
-			tags->nr_batch_move, tags->nr_max_cache);
+	page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
+			"bits_per_word=%u\n",
+			tags->nr_tags, tags->nr_reserved_tags,
+			tags->bitmap_tags.bits_per_word);

-	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n",
-			percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids),
-			percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
+	free = bt_unused_tags(&tags->bitmap_tags);
+	res = bt_unused_tags(&tags->breserved_tags);

-	for_each_possible_cpu(cpu) {
-		page += sprintf(page, "  cpu%02u: nr_free=%u\n", cpu,
-				percpu_ida_free_tags(&tags->free_tags, cpu));
-	}
+	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
+	page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));

 	return page - orig_page;
 }
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@ -1,17 +1,59 @@
 #ifndef INT_BLK_MQ_TAG_H
 #define INT_BLK_MQ_TAG_H

-struct blk_mq_tags;
+#include "blk-mq.h"
+
+enum {
+	BT_WAIT_QUEUES	= 8,
+	BT_WAIT_BATCH	= 8,
+};
+
+struct bt_wait_state {
+	atomic_t wait_cnt;
+	wait_queue_head_t wait;
+} ____cacheline_aligned_in_smp;
+
+#define TAG_TO_INDEX(bt, tag)	((tag) >> (bt)->bits_per_word)
+#define TAG_TO_BIT(bt, tag)	((tag) & ((1 << (bt)->bits_per_word) - 1))
+
+struct blk_mq_bitmap_tags {
+	unsigned int depth;
+	unsigned int wake_cnt;
+	unsigned int bits_per_word;
+
+	unsigned int map_nr;
+	struct blk_align_bitmap *map;
+
+	unsigned int wake_index;
+	struct bt_wait_state *bs;
+};
+
+/*
+ * Tag address space map.
+ */
+struct blk_mq_tags {
+	unsigned int nr_tags;
+	unsigned int nr_reserved_tags;
+
+	atomic_t active_queues;
+
+	struct blk_mq_bitmap_tags bitmap_tags;
+	struct blk_mq_bitmap_tags breserved_tags;
+
+	struct request **rqs;
+	struct list_head page_list;
+};
+

 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);

-extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved);
-extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags);
-extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
-extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
+extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
+extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
+extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);

 enum {
 	BLK_MQ_TAG_CACHE_MIN	= 1,
@ -24,4 +66,23 @@ enum {
 	BLK_MQ_TAG_MAX		= BLK_MQ_TAG_FAIL - 1,
 };

+extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+
+static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return false;
+
+	return __blk_mq_tag_busy(hctx);
+}
+
+static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return;
+
+	__blk_mq_tag_idle(hctx);
+}
+
 #endif
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@ -1,6 +1,8 @@
 #ifndef INT_BLK_MQ_H
 #define INT_BLK_MQ_H

+struct blk_mq_tag_set;
+
 struct blk_mq_ctx {
 	struct {
 		spinlock_t		lock;
@ -9,7 +11,8 @@ struct blk_mq_ctx {

 	unsigned int		cpu;
 	unsigned int		index_hw;
-	unsigned int		ipi_redirect;
+
+	unsigned int		last_tag ____cacheline_aligned_in_smp;

 	/* incremented at dispatch time */
 	unsigned long		rq_dispatched[2];
@ -20,21 +23,23 @@ struct blk_mq_ctx {

 	struct request_queue	*queue;
 	struct kobject		kobj;
-};
+} ____cacheline_aligned_in_smp;

 void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_init_flush(struct request_queue *q);
 void blk_mq_drain_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
-void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq);
+void blk_mq_clone_flush_request(struct request *flush_rq,
+		struct request *orig_rq);
+int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);

 /*
 * CPU hotplug helpers
 */
 struct blk_mq_cpu_notifier;
 void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
-			      void (*fn)(void *, unsigned long, unsigned int),
+			      int (*fn)(void *, unsigned long, unsigned int),
 			      void *data);
 void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
 void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
@ -45,10 +50,23 @@ void blk_mq_disable_hotplug(void);
 /*
 * CPU -> queue mappings
 */
-struct blk_mq_reg;
-extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
+extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
 extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
+extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);

-void blk_mq_add_timer(struct request *rq);
+/*
+ * sysfs helpers
+ */
+extern int blk_mq_sysfs_register(struct request_queue *q);
+extern void blk_mq_sysfs_unregister(struct request_queue *q);
+
+/*
+ * Basic implementation of sparser bitmap, allowing the user to spread
+ * the bits over more cachelines.
+ */
+struct blk_align_bitmap {
+	unsigned long word;
+	unsigned long depth;
+} ____cacheline_aligned_in_smp;

 #endif
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
 static ssize_t
 queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
-	struct request_list *rl;
 	unsigned long nr;
-	int ret;
+	int ret, err;

-	if (!q->request_fn)
+	if (!q->request_fn && !q->mq_ops)
 		return -EINVAL;

 	ret = queue_var_store(&nr, page, count);
@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 	if (nr < BLKDEV_MIN_RQ)
 		nr = BLKDEV_MIN_RQ;

-	spin_lock_irq(q->queue_lock);
-	q->nr_requests = nr;
-	blk_queue_congestion_threshold(q);
+	if (q->request_fn)
+		err = blk_update_nr_requests(q, nr);
+	else
+		err = blk_mq_update_nr_requests(q, nr);

-	/* congestion isn't cgroup aware and follows root blkcg for now */
-	rl = &q->root_rl;
+	if (err)
+		return err;

-	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, BLK_RW_SYNC);
-	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, BLK_RW_SYNC);
-
-	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, BLK_RW_ASYNC);
-	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, BLK_RW_ASYNC);
-
-	blk_queue_for_each_rl(rl, q) {
-		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
-			blk_set_rl_full(rl, BLK_RW_SYNC);
-		} else {
-			blk_clear_rl_full(rl, BLK_RW_SYNC);
-			wake_up(&rl->wait[BLK_RW_SYNC]);
-		}
-
-		if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
-			blk_set_rl_full(rl, BLK_RW_ASYNC);
-		} else {
-			blk_clear_rl_full(rl, BLK_RW_ASYNC);
-			wake_up(&rl->wait[BLK_RW_ASYNC]);
-		}
-	}
-
-	spin_unlock_irq(q->queue_lock);
 	return ret;
 }

@ -544,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);

-	percpu_counter_destroy(&q->mq_usage_counter);
-
 	if (q->mq_ops)
 		blk_mq_free_queue(q);

--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
 static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
 {
 	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
-		return 0;
+		return false;

 	return 1;
 }
@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 	if (tg->io_disp[rw] + 1 <= io_allowed) {
 		if (wait)
 			*wait = 0;
-		return 1;
+		return true;
 	}

 	/* Calc approx time to dispatch */
@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
 	if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
 		if (wait)
 			*wait = 0;
-		return 1;
+		return true;
 	}

 	/* Calc approx time to dispatch */
@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
 		if (wait)
 			*wait = 0;
-		return 1;
+		return true;
 	}

 	/*
@ -1258,7 +1258,7 @@ out_unlock:
 * of throtl_data->service_queue.  Those bio's are ready and issued by this
 * function.
 */
-void blk_throtl_dispatch_work_fn(struct work_struct *work)
+static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 {
 	struct throtl_data *td = container_of(work, struct throtl_data,
 					      dispatch_work);
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req)
 			__blk_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
-		if (q->mq_ops)
-			blk_mq_add_timer(req);
-		else
-			blk_add_timer(req);
-
+		blk_add_timer(req);
 		blk_clear_rq_complete(req);
 		break;
 	case BLK_EH_NOT_HANDLED:
@ -170,7 +166,26 @@ void blk_abort_request(struct request *req)
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);

-void __blk_add_timer(struct request *req, struct list_head *timeout_list)
+unsigned long blk_rq_timeout(unsigned long timeout)
+{
+	unsigned long maxt;
+
+	maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
+	if (time_after(timeout, maxt))
+		timeout = maxt;
+
+	return timeout;
+}
+
+/**
+ * blk_add_timer - Start timeout timer for a single request
+ * @req:	request that is about to start running.
+ *
+ * Notes:
+ *    Each request has its own timer, and as it is added to the queue, we
+ *    set up the timer. When the request completes, we cancel the timer.
+ */
+void blk_add_timer(struct request *req)
 {
 	struct request_queue *q = req->q;
 	unsigned long expiry;
@ -188,32 +203,29 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list)
 		req->timeout = q->rq_timeout;

 	req->deadline = jiffies + req->timeout;
-	if (timeout_list)
-		list_add_tail(&req->timeout_list, timeout_list);
+	if (!q->mq_ops)
+		list_add_tail(&req->timeout_list, &req->q->timeout_list);

 	/*
 	 * If the timer isn't already pending or this timeout is earlier
 	 * than an existing one, modify the timer. Round up to next nearest
 	 * second.
 	 */
-	expiry = round_jiffies_up(req->deadline);
+	expiry = blk_rq_timeout(round_jiffies_up(req->deadline));

 	if (!timer_pending(&q->timeout) ||
-	    time_before(expiry, q->timeout.expires))
-		mod_timer(&q->timeout, expiry);
+	    time_before(expiry, q->timeout.expires)) {
+		unsigned long diff = q->timeout.expires - expiry;
+
+		/*
+		 * Due to added timer slack to group timers, the timer
+		 * will often be a little in front of what we asked for.
+		 * So apply some tolerance here too, otherwise we keep
+		 * modifying the timer because expires for value X
+		 * will be X + something.
+		 */
+		if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
+			mod_timer(&q->timeout, expiry);
+	}

 }
-
-/**
- * blk_add_timer - Start timeout timer for a single request
- * @req:	request that is about to start running.
- *
- * Notes:
- *    Each request has its own timer, and as it is added to the queue, we
- *    set up the timer. When the request completes, we cancel the timer.
- */
-void blk_add_timer(struct request *req)
-{
-	__blk_add_timer(req, &req->q->timeout_list);
-}
-
--- a/block/blk.h
+++ b/block/blk.h
@ -9,6 +9,9 @@
 /* Number of requests a "batching" process may submit */
 #define BLK_BATCH_REQ	32

+/* Max future timer expiry for timeouts */
+#define BLK_MAX_TIMEOUT		(5 * HZ)
+
 extern struct kmem_cache *blk_requestq_cachep;
 extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
@ -37,9 +40,9 @@ bool __blk_end_bidi_request(struct request *rq, int error,
 void blk_rq_timed_out_timer(unsigned long data);
 void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
 			  unsigned int *next_set);
-void __blk_add_timer(struct request *req, struct list_head *timeout_list);
+unsigned long blk_rq_timeout(unsigned long timeout);
+void blk_add_timer(struct request *req);
 void blk_delete_timer(struct request *);
-void blk_add_timer(struct request *);


 bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
@ -185,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 	return q->nr_congestion_off;
 }

+extern int blk_update_nr_requests(struct request_queue *, unsigned int);
+
 /*
 * Contribute to IO statistics IFF:
 *
--- a/block/bounce.c
+++ b/block/bounce.c
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@ -908,7 +908,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 {
 	if (cfqd->busy_queues) {
 		cfq_log(cfqd, "schedule dispatch");
-		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
+		kblockd_schedule_work(&cfqd->unplug_work);
 	}
 }

@ -4460,7 +4460,7 @@ out_free:
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
 {
-	return sprintf(page, "%d\n", var);
+	return sprintf(page, "%u\n", var);
 }

 static ssize_t
--- a/block/ioprio.c
+++ b/block/ioprio.c
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@ -205,10 +205,6 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
 	if (capable(CAP_SYS_RAWIO))
 		return 0;

-	/* if there's no filter set, assume we're filtering everything out */
-	if (!filter)
-		return -EPERM;
-
 	/* Anybody who can open the device can do a read-safe command */
 	if (test_bit(cmd[0], filter->read_ok))
 		return 0;
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@ -1406,7 +1406,7 @@ next_segment:

 		track = block / (floppy->dtype->sects * floppy->type->sect_mult);
 		sector = block % (floppy->dtype->sects * floppy->type->sect_mult);
-		data = rq->buffer + 512 * cnt;
+		data = bio_data(rq->bio) + 512 * cnt;
 #ifdef DEBUG
 		printk("access to track %d, sector %d, with buffer at "
 		       "0x%08lx\n", track, sector, data);
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@ -1484,7 +1484,7 @@ repeat:
 	ReqCnt = 0;
 	ReqCmd = rq_data_dir(fd_request);
 	ReqBlock = blk_rq_pos(fd_request);
-	ReqBuffer = fd_request->buffer;
+	ReqBuffer = bio_data(fd_request->bio);
 	setup_req_params( drive );
 	do_fd_action( drive );

--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@ -2351,7 +2351,7 @@ static void rw_interrupt(void)
 	}

 	if (CT(COMMAND) != FD_READ ||
-	    raw_cmd->kernel_data == current_req->buffer) {
+	    raw_cmd->kernel_data == bio_data(current_req->bio)) {
 		/* transfer directly from buffer */
 		cont->done(1);
 	} else if (CT(COMMAND) == FD_READ) {
@ -2640,7 +2640,7 @@ static int make_raw_rw_request(void)
 		raw_cmd->flags &= ~FD_RAW_WRITE;
 		raw_cmd->flags |= FD_RAW_READ;
 		COMMAND = FM_MODE(_floppy, FD_READ);
-	} else if ((unsigned long)current_req->buffer < MAX_DMA_ADDRESS) {
+	} else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) {
 		unsigned long dma_limit;
 		int direct, indirect;

@ -2654,13 +2654,13 @@ static int make_raw_rw_request(void)
 		 */
 		max_size = buffer_chain_size();
 		dma_limit = (MAX_DMA_ADDRESS -
-			     ((unsigned long)current_req->buffer)) >> 9;
+			     ((unsigned long)bio_data(current_req->bio))) >> 9;
 		if ((unsigned long)max_size > dma_limit)
 			max_size = dma_limit;
 		/* 64 kb boundaries */
-		if (CROSS_64KB(current_req->buffer, max_size << 9))
+		if (CROSS_64KB(bio_data(current_req->bio), max_size << 9))
 			max_size = (K_64 -
-				    ((unsigned long)current_req->buffer) %
+				    ((unsigned long)bio_data(current_req->bio)) %
 				    K_64) >> 9;
 		direct = transfer_size(ssize, max_sector, max_size) - fsector_t;
 		/*
@ -2677,7 +2677,7 @@ static int make_raw_rw_request(void)
 		       (DP->read_track & (1 << DRS->probed_format)))))) {
 			max_size = blk_rq_sectors(current_req);
 		} else {
-			raw_cmd->kernel_data = current_req->buffer;
+			raw_cmd->kernel_data = bio_data(current_req->bio);
 			raw_cmd->length = current_count_sectors << 9;
 			if (raw_cmd->length == 0) {
 				DPRINT("%s: zero dma transfer attempted\n", __func__);
@ -2731,7 +2731,7 @@ static int make_raw_rw_request(void)
 	raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1;
 	raw_cmd->length <<= 9;
 	if ((raw_cmd->length < current_count_sectors << 9) ||
-	    (raw_cmd->kernel_data != current_req->buffer &&
+	    (raw_cmd->kernel_data != bio_data(current_req->bio) &&
 	     CT(COMMAND) == FD_WRITE &&
 	     (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max ||
 	      aligned_sector_t < buffer_min)) ||
@ -2739,7 +2739,7 @@ static int make_raw_rw_request(void)
 	    raw_cmd->length <= 0 || current_count_sectors <= 0) {
 		DPRINT("fractionary current count b=%lx s=%lx\n",
 		       raw_cmd->length, current_count_sectors);
-		if (raw_cmd->kernel_data != current_req->buffer)
+		if (raw_cmd->kernel_data != bio_data(current_req->bio))
 			pr_info("addr=%d, length=%ld\n",
 				(int)((raw_cmd->kernel_data -
 				       floppy_track_buffer) >> 9),
@ -2756,7 +2756,7 @@ static int make_raw_rw_request(void)
 		return 0;
 	}

-	if (raw_cmd->kernel_data != current_req->buffer) {
+	if (raw_cmd->kernel_data != bio_data(current_req->bio)) {
 		if (raw_cmd->kernel_data < floppy_track_buffer ||
 		    current_count_sectors < 0 ||
 		    raw_cmd->length < 0 ||
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@ -464,11 +464,11 @@ static void read_intr(void)

 ok_to_read:
 	req = hd_req;
-	insw(HD_DATA, req->buffer, 256);
+	insw(HD_DATA, bio_data(req->bio), 256);
 #ifdef DEBUG
 	printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
 	       req->rq_disk->disk_name, blk_rq_pos(req) + 1,
-	       blk_rq_sectors(req) - 1, req->buffer+512);
+	       blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
 #endif
 	if (hd_end_request(0, 512)) {
 		SET_HANDLER(&read_intr);
@ -505,7 +505,7 @@ static void write_intr(void)
 ok_to_write:
 	if (hd_end_request(0, 512)) {
 		SET_HANDLER(&write_intr);
-		outsw(HD_DATA, req->buffer, 256);
+		outsw(HD_DATA, bio_data(req->bio), 256);
 		return;
 	}

@ -624,7 +624,7 @@ repeat:
 	printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
 		req->rq_disk->disk_name,
 		req_data_dir(req) == READ ? "read" : "writ",
-		cyl, head, sec, nsect, req->buffer);
+		cyl, head, sec, nsect, bio_data(req->bio));
 #endif
 	if (req->cmd_type == REQ_TYPE_FS) {
 		switch (rq_data_dir(req)) {
@ -643,7 +643,7 @@ repeat:
 				bad_rw_intr();
 				goto repeat;
 			}
-			outsw(HD_DATA, req->buffer, 256);
+			outsw(HD_DATA, bio_data(req->bio), 256);
 			break;
 		default:
 			printk("unknown hd-command\n");
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@ -479,7 +479,7 @@ static unsigned int mg_out(struct mg_host *host,

 static void mg_read_one(struct mg_host *host, struct request *req)
 {
-	u16 *buff = (u16 *)req->buffer;
+	u16 *buff = (u16 *)bio_data(req->bio);
 	u32 i;

 	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
@ -496,7 +496,7 @@ static void mg_read(struct request *req)
 		mg_bad_rw_intr(host);

 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-	       blk_rq_sectors(req), blk_rq_pos(req), req->buffer);
+	       blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));

 	do {
 		if (mg_wait(host, ATA_DRQ,
@ -514,7 +514,7 @@ static void mg_read(struct request *req)

 static void mg_write_one(struct mg_host *host, struct request *req)
 {
-	u16 *buff = (u16 *)req->buffer;
+	u16 *buff = (u16 *)bio_data(req->bio);
 	u32 i;

 	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
@ -534,7 +534,7 @@ static void mg_write(struct request *req)
 	}

 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-	       rem, blk_rq_pos(req), req->buffer);
+	       rem, blk_rq_pos(req), bio_data(req->bio));

 	if (mg_wait(host, ATA_DRQ,
 		    MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
@ -585,7 +585,7 @@ ok_to_read:
 	mg_read_one(host, req);

 	MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-	       blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer);
+	       blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));

 	/* send read confirm */
 	outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
@ -624,7 +624,7 @@ ok_to_write:
 		/* write 1 sector and set handler if remains */
 		mg_write_one(host, req);
 		MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-		       blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
+		       blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
 		host->mg_do_intr = mg_write_intr;
 		mod_timer(&host->timer, jiffies + 3 * HZ);
 	}
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@ -32,6 +32,7 @@ struct nullb {
 	unsigned int index;
 	struct request_queue *q;
 	struct gendisk *disk;
+	struct blk_mq_tag_set tag_set;
 	struct hrtimer timer;
 	unsigned int queue_depth;
 	spinlock_t lock;
@ -226,7 +227,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)

 static void null_softirq_done_fn(struct request *rq)
 {
-	end_cmd(rq->special);
+	end_cmd(blk_mq_rq_to_pdu(rq));
 }

 static inline void null_handle_cmd(struct nullb_cmd *cmd)
@ -311,7 +312,7 @@ static void null_request_fn(struct request_queue *q)

 static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-	struct nullb_cmd *cmd = rq->special;
+	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);

 	cmd->rq = rq;
 	cmd->nq = hctx->driver_data;
@ -320,46 +321,6 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	return BLK_MQ_RQ_QUEUE_OK;
 }

-static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index)
-{
-	int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes);
-	int tip = (reg->nr_hw_queues % nr_online_nodes);
-	int node = 0, i, n;
-
-	/*
-	 * Split submit queues evenly wrt to the number of nodes. If uneven,
-	 * fill the first buckets with one extra, until the rest is filled with
-	 * no extra.
-	 */
-	for (i = 0, n = 1; i < hctx_index; i++, n++) {
-		if (n % b_size == 0) {
-			n = 0;
-			node++;
-
-			tip--;
-			if (!tip)
-				b_size = reg->nr_hw_queues / nr_online_nodes;
-		}
-	}
-
-	/*
-	 * A node might not be online, therefore map the relative node id to the
-	 * real node id.
-	 */
-	for_each_online_node(n) {
-		if (!node)
-			break;
-		node--;
-	}
-
-	return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n);
-}
-
-static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index)
-{
-	kfree(hctx);
-}
-
 static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
 {
 	BUG_ON(!nullb);
@ -389,19 +350,14 @@ static struct blk_mq_ops null_mq_ops = {
 	.complete	= null_softirq_done_fn,
 };

-static struct blk_mq_reg null_mq_reg = {
-	.ops		= &null_mq_ops,
-	.queue_depth	= 64,
-	.cmd_size	= sizeof(struct nullb_cmd),
-	.flags		= BLK_MQ_F_SHOULD_MERGE,
-};
-
 static void null_del_dev(struct nullb *nullb)
 {
 	list_del_init(&nullb->list);

 	del_gendisk(nullb->disk);
 	blk_cleanup_queue(nullb->q);
+	if (queue_mode == NULL_Q_MQ)
+		blk_mq_free_tag_set(&nullb->tag_set);
 	put_disk(nullb->disk);
 	kfree(nullb);
 }
@ -506,7 +462,7 @@ static int null_add_dev(void)

 	nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
 	if (!nullb)
-		return -ENOMEM;
+		goto out;

 	spin_lock_init(&nullb->lock);

@ -514,49 +470,44 @@ static int null_add_dev(void)
 		submit_queues = nr_online_nodes;

 	if (setup_queues(nullb))
-		goto err;
+		goto out_free_nullb;

 	if (queue_mode == NULL_Q_MQ) {
-		null_mq_reg.numa_node = home_node;
-		null_mq_reg.queue_depth = hw_queue_depth;
-		null_mq_reg.nr_hw_queues = submit_queues;
+		nullb->tag_set.ops = &null_mq_ops;
+		nullb->tag_set.nr_hw_queues = submit_queues;
+		nullb->tag_set.queue_depth = hw_queue_depth;
+		nullb->tag_set.numa_node = home_node;
+		nullb->tag_set.cmd_size	= sizeof(struct nullb_cmd);
+		nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+		nullb->tag_set.driver_data = nullb;

-		if (use_per_node_hctx) {
-			null_mq_reg.ops->alloc_hctx = null_alloc_hctx;
-			null_mq_reg.ops->free_hctx = null_free_hctx;
-		} else {
-			null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue;
-			null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue;
-		}
+		if (blk_mq_alloc_tag_set(&nullb->tag_set))
+			goto out_cleanup_queues;

-		nullb->q = blk_mq_init_queue(&null_mq_reg, nullb);
+		nullb->q = blk_mq_init_queue(&nullb->tag_set);
+		if (!nullb->q)
+			goto out_cleanup_tags;
 	} else if (queue_mode == NULL_Q_BIO) {
 		nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+		if (!nullb->q)
+			goto out_cleanup_queues;
 		blk_queue_make_request(nullb->q, null_queue_bio);
 		init_driver_queues(nullb);
 	} else {
 		nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
+		if (!nullb->q)
+			goto out_cleanup_queues;
 		blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
-		if (nullb->q)
-			blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+		blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
 		init_driver_queues(nullb);
 	}

-	if (!nullb->q)
-		goto queue_fail;
-
 	nullb->q->queuedata = nullb;
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);

 	disk = nullb->disk = alloc_disk_node(1, home_node);
-	if (!disk) {
-queue_fail:
-		blk_cleanup_queue(nullb->q);
-		cleanup_queues(nullb);
-err:
-		kfree(nullb);
-		return -ENOMEM;
-	}
+	if (!disk)
+		goto out_cleanup_blk_queue;

 	mutex_lock(&lock);
 	list_add_tail(&nullb->list, &nullb_list);
@ -579,6 +530,18 @@ err:
 	sprintf(disk->disk_name, "nullb%d", nullb->index);
 	add_disk(disk);
 	return 0;
+
+out_cleanup_blk_queue:
+	blk_cleanup_queue(nullb->q);
+out_cleanup_tags:
+	if (queue_mode == NULL_Q_MQ)
+		blk_mq_free_tag_set(&nullb->tag_set);
+out_cleanup_queues:
+	cleanup_queues(nullb);
+out_free_nullb:
+	kfree(nullb);
+out:
+	return -ENOMEM;
 }

 static int __init null_init(void)
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@ -747,7 +747,7 @@ static void do_pcd_request(struct request_queue * q)
 			pcd_current = cd;
 			pcd_sector = blk_rq_pos(pcd_req);
 			pcd_count = blk_rq_cur_sectors(pcd_req);
-			pcd_buf = pcd_req->buffer;
+			pcd_buf = bio_data(pcd_req->bio);
 			pcd_busy = 1;
 			ps_set_intr(do_pcd_read, NULL, 0, nice);
 			return;
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@ -454,7 +454,7 @@ static enum action do_pd_io_start(void)
 		if (pd_block + pd_count > get_capacity(pd_req->rq_disk))
 			return Fail;
 		pd_run = blk_rq_sectors(pd_req);
-		pd_buf = pd_req->buffer;
+		pd_buf = bio_data(pd_req->bio);
 		pd_retries = 0;
 		if (pd_cmd == READ)
 			return do_pd_read_start();
@ -485,7 +485,7 @@ static int pd_next_buf(void)
 	spin_lock_irqsave(&pd_lock, saved_flags);
 	__blk_end_request_cur(pd_req, 0);
 	pd_count = blk_rq_cur_sectors(pd_req);
-	pd_buf = pd_req->buffer;
+	pd_buf = bio_data(pd_req->bio);
 	spin_unlock_irqrestore(&pd_lock, saved_flags);
 	return 0;
 }
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@ -795,7 +795,7 @@ repeat:
 	}

 	pf_cmd = rq_data_dir(pf_req);
-	pf_buf = pf_req->buffer;
+	pf_buf = bio_data(pf_req->bio);
 	pf_retries = 0;

 	pf_busy = 1;
@ -827,7 +827,7 @@ static int pf_next_buf(void)
 		if (!pf_req)
 			return 1;
 		pf_count = blk_rq_cur_sectors(pf_req);
-		pf_buf = pf_req->buffer;
+		pf_buf = bio_data(pf_req->bio);
 	}
 	return 0;
 }
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@ -563,7 +563,6 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,

 	req = skreq->req;
 	blk_add_request_payload(req, page, len);
-	req->buffer = buf;
 }

 static void skd_request_fn_not_online(struct request_queue *q);
@ -744,6 +743,7 @@ static void skd_request_fn(struct request_queue *q)
 				break;
 			}
 			skreq->discard_page = 1;
+			req->completion_data = page;
 			skd_prep_discard_cdb(scsi_req, skreq, page, lba, count);

 		} else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) {
@ -858,8 +858,7 @@ static void skd_end_request(struct skd_device *skdev,
 		(skreq->discard_page == 1)) {
 		pr_debug("%s:%s:%d, free the page!",
 			 skdev->name, __func__, __LINE__);
-		free_page((unsigned long)req->buffer);
-		req->buffer = NULL;
+		__free_page(req->completion_data);
 	}

 	if (unlikely(error)) {
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@ -549,7 +549,7 @@ static void redo_fd_request(struct request_queue *q)
 		case READ:
 			err = floppy_read_sectors(fs, blk_rq_pos(req),
 						  blk_rq_cur_sectors(req),
-						  req->buffer);
+						  bio_data(req->bio));
 			break;
 		}
 	done:
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@ -342,7 +342,7 @@ static void start_request(struct floppy_state *fs)
 		swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
 			  req->rq_disk->disk_name, req->cmd,
 			  (long)blk_rq_pos(req), blk_rq_sectors(req),
-			  req->buffer);
+			  bio_data(req->bio));
 		swim3_dbg("           errors=%d current_nr_sectors=%u\n",
 			  req->errors, blk_rq_cur_sectors(req));
 #endif
@ -479,11 +479,11 @@ static inline void setup_transfer(struct floppy_state *fs)
 		/* Set up 3 dma commands: write preamble, data, postamble */
 		init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
 		++cp;
-		init_dma(cp, OUTPUT_MORE, req->buffer, 512);
+		init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512);
 		++cp;
 		init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
 	} else {
-		init_dma(cp, INPUT_LAST, req->buffer, n * 512);
+		init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512);
 	}
 	++cp;
 	out_le16(&cp->command, DBDMA_STOP);
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@ -30,6 +30,9 @@ struct virtio_blk
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;

+	/* Block layer tags. */
+	struct blk_mq_tag_set tag_set;
+
 	/* Process context for config space updates */
 	struct work_struct config_work;

@ -112,7 +115,7 @@ static int __virtblk_add_req(struct virtqueue *vq,

 static inline void virtblk_request_done(struct request *req)
 {
-	struct virtblk_req *vbr = req->special;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 	int error = virtblk_result(vbr);

 	if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
@ -147,14 +150,14 @@ static void virtblk_done(struct virtqueue *vq)

 	/* In case queue is stopped waiting for more buffers. */
 	if (req_done)
-		blk_mq_start_stopped_hw_queues(vblk->disk->queue);
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
 	spin_unlock_irqrestore(&vblk->vq_lock, flags);
 }

 static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 {
 	struct virtio_blk *vblk = hctx->queue->queuedata;
-	struct virtblk_req *vbr = req->special;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 	unsigned long flags;
 	unsigned int num;
 	const bool last = (req->cmd_flags & REQ_END) != 0;
@ -480,33 +483,27 @@ static const struct device_attribute dev_attr_cache_type_rw =
 	__ATTR(cache_type, S_IRUGO|S_IWUSR,
 	       virtblk_cache_type_show, virtblk_cache_type_store);

-static struct blk_mq_ops virtio_mq_ops = {
-	.queue_rq	= virtio_queue_rq,
-	.map_queue	= blk_mq_map_queue,
-	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
-	.free_hctx	= blk_mq_free_single_hw_queue,
-	.complete	= virtblk_request_done,
-};
-
-static struct blk_mq_reg virtio_mq_reg = {
-	.ops		= &virtio_mq_ops,
-	.nr_hw_queues	= 1,
-	.queue_depth	= 0, /* Set in virtblk_probe */
-	.numa_node	= NUMA_NO_NODE,
-	.flags		= BLK_MQ_F_SHOULD_MERGE,
-};
-module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444);
-
-static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
-			     struct request *rq, unsigned int nr)
+static int virtblk_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
 {
 	struct virtio_blk *vblk = data;
-	struct virtblk_req *vbr = rq->special;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);

 	sg_init_table(vbr->sg, vblk->sg_elems);
 	return 0;
 }

+static struct blk_mq_ops virtio_mq_ops = {
+	.queue_rq	= virtio_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.complete	= virtblk_request_done,
+	.init_request	= virtblk_init_request,
+};
+
+static unsigned int virtblk_queue_depth;
+module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
+
 static int virtblk_probe(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk;
@ -561,24 +558,34 @@ static int virtblk_probe(struct virtio_device *vdev)
 	}

 	/* Default queue sizing is to fill the ring. */
-	if (!virtio_mq_reg.queue_depth) {
-		virtio_mq_reg.queue_depth = vblk->vq->num_free;
+	if (!virtblk_queue_depth) {
+		virtblk_queue_depth = vblk->vq->num_free;
 		/* ... but without indirect descs, we use 2 descs per req */
 		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
-			virtio_mq_reg.queue_depth /= 2;
+			virtblk_queue_depth /= 2;
 	}
-	virtio_mq_reg.cmd_size =
+
+	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
+	vblk->tag_set.ops = &virtio_mq_ops;
+	vblk->tag_set.nr_hw_queues = 1;
+	vblk->tag_set.queue_depth = virtblk_queue_depth;
+	vblk->tag_set.numa_node = NUMA_NO_NODE;
+	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	vblk->tag_set.cmd_size =
 		sizeof(struct virtblk_req) +
 		sizeof(struct scatterlist) * sg_elems;
+	vblk->tag_set.driver_data = vblk;

-	q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk);
+	err = blk_mq_alloc_tag_set(&vblk->tag_set);
+	if (err)
+		goto out_put_disk;
+
+	q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
 	if (!q) {
 		err = -ENOMEM;
-		goto out_put_disk;
+		goto out_free_tags;
 	}

-	blk_mq_init_commands(q, virtblk_init_vbr, vblk);
-
 	q->queuedata = vblk;

 	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@ -679,6 +686,8 @@ static int virtblk_probe(struct virtio_device *vdev)
 out_del_disk:
 	del_gendisk(vblk->disk);
 	blk_cleanup_queue(vblk->disk->queue);
+out_free_tags:
+	blk_mq_free_tag_set(&vblk->tag_set);
 out_put_disk:
 	put_disk(vblk->disk);
 out_free_vq:
@ -705,6 +714,8 @@ static void virtblk_remove(struct virtio_device *vdev)
 	del_gendisk(vblk->disk);
 	blk_cleanup_queue(vblk->disk->queue);

+	blk_mq_free_tag_set(&vblk->tag_set);
+
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);

@ -749,7 +760,7 @@ static int virtblk_restore(struct virtio_device *vdev)
 	vblk->config_enable = true;
 	ret = init_vq(vdev->priv);
 	if (!ret)
-		blk_mq_start_stopped_hw_queues(vblk->disk->queue);
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);

 	return ret;
 }
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@ -612,10 +612,10 @@ static void do_blkif_request(struct request_queue *rq)
 		}

 		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
-			 "(%u/%u) buffer:%p [%s]\n",
+			 "(%u/%u) [%s]\n",
 			 req, req->cmd, (unsigned long)blk_rq_pos(req),
 			 blk_rq_cur_sectors(req), blk_rq_sectors(req),
-			 req->buffer, rq_data_dir(req) ? "write" : "read");
+			 rq_data_dir(req) ? "write" : "read");

 		if (blkif_queue_request(req)) {
 			blk_requeue_request(rq, req);
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@ -661,7 +661,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			rq_data_dir(req));

 		ace->req = req;
-		ace->data_ptr = req->buffer;
+		ace->data_ptr = bio_data(req->bio);
 		ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR;
 		ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF);

@ -733,7 +733,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			 *      blk_rq_sectors(ace->req),
 			 *      blk_rq_cur_sectors(ace->req));
 			 */
-			ace->data_ptr = ace->req->buffer;
+			ace->data_ptr = bio_data(ace->req->bio);
 			ace->data_count = blk_rq_cur_sectors(ace->req) * 16;
 			ace_fsm_yieldirq(ace);
 			break;
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@ -87,13 +87,15 @@ static void do_z2_request(struct request_queue *q)
 		while (len) {
 			unsigned long addr = start & Z2RAM_CHUNKMASK;
 			unsigned long size = Z2RAM_CHUNKSIZE - addr;
+			void *buffer = bio_data(req->bio);
+
 			if (len < size)
 				size = len;
 			addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ];
 			if (rq_data_dir(req) == READ)
-				memcpy(req->buffer, (char *)addr, size);
+				memcpy(buffer, (char *)addr, size);
 			else
-				memcpy((char *)addr, req->buffer, size);
+				memcpy((char *)addr, buffer, size);
 			start += size;
 			len -= size;
 		}
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@ -602,7 +602,7 @@ static void gdrom_readdisk_dma(struct work_struct *work)
 		spin_unlock(&gdrom_lock);
 		block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET;
 		block_cnt = blk_rq_sectors(req)/GD_TO_BLK;
-		__raw_writel(virt_to_phys(req->buffer), GDROM_DMA_STARTADDR_REG);
+		__raw_writel(virt_to_phys(bio_data(req->bio)), GDROM_DMA_STARTADDR_REG);
 		__raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG);
 		__raw_writel(1, GDROM_DMA_DIRECTION_REG);
 		__raw_writel(1, GDROM_DMA_ENABLE_REG);
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@ -902,6 +902,7 @@ void add_disk_randomness(struct gendisk *disk)
 	add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
 	trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool));
 }
+EXPORT_SYMBOL_GPL(add_disk_randomness);
 #endif

 /*********************************************************************
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@ -188,10 +188,9 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,

 	ledtrig_ide_activity();

-	pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n",
+	pr_debug("%s: %sing: block=%llu, sectors=%u\n",
 		 drive->name, rq_data_dir(rq) == READ ? "read" : "writ",
-		 (unsigned long long)block, blk_rq_sectors(rq),
-		 (unsigned long)rq->buffer);
+		 (unsigned long long)block, blk_rq_sectors(rq));

 	if (hwif->rw_disk)
 		hwif->rw_disk(drive, rq);
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@ -1544,7 +1544,6 @@ static int setup_clone(struct request *clone, struct request *rq,
 	clone->cmd = rq->cmd;
 	clone->cmd_len = rq->cmd_len;
 	clone->sense = rq->sense;
-	clone->buffer = rq->buffer;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;

--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@ -82,8 +82,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,

 	block = blk_rq_pos(req) << 9 >> tr->blkshift;
 	nsect = blk_rq_cur_bytes(req) >> tr->blkshift;
-
-	buf = req->buffer;
+	buf = bio_data(req->bio);

 	if (req->cmd_type != REQ_TYPE_FS)
 		return -EIO;
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@ -253,7 +253,7 @@ static int do_ubiblock_request(struct ubiblock *dev, struct request *req)
 	 * flash access anyway.
 	 */
 	mutex_lock(&dev->dev_mutex);
-	ret = ubiblock_read(dev, req->buffer, sec, len);
+	ret = ubiblock_read(dev, bio_data(req->bio), sec, len);
 	mutex_unlock(&dev->dev_mutex);

 	return ret;
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@ -207,7 +207,7 @@ static void jsfd_do_request(struct request_queue *q)
 			goto end;
 		}

-		jsfd_read(req->buffer, jdp->dbase + offset, len);
+		jsfd_read(bio_data(req->bio), jdp->dbase + offset, len);
 		err = 0;
 	end:
 		if (!__blk_end_request_cur(req, err))
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@ -140,7 +140,7 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
 	cmd->result = 0;
 	spin_lock_irqsave(q->queue_lock, flags);
 	blk_requeue_request(q, cmd->request);
-	kblockd_schedule_work(q, &device->requeue_work);
+	kblockd_schedule_work(&device->requeue_work);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }

@ -1019,8 +1019,6 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
 		return BLKPREP_DEFER;
 	}

-	req->buffer = NULL;
-
 	/* 
 	 * Next, walk the list, and fill in the addresses and sizes of
 	 * each segment.
@ -1158,7 +1156,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 		BUG_ON(blk_rq_bytes(req));

 		memset(&cmd->sdb, 0, sizeof(cmd->sdb));
-		req->buffer = NULL;
 	}

 	cmd->cmd_len = req->cmd_len;
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@ -737,16 +737,14 @@ static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq)
 		goto out;
 	}

+	rq->completion_data = page;
 	blk_add_request_payload(rq, page, len);
 	ret = scsi_setup_blk_pc_cmnd(sdp, rq);
-	rq->buffer = page_address(page);
 	rq->__data_len = nr_bytes;

 out:
-	if (ret != BLKPREP_OK) {
+	if (ret != BLKPREP_OK)
 		__free_page(page);
-		rq->buffer = NULL;
-	}
 	return ret;
 }

@ -842,10 +840,9 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq)
 {
 	struct scsi_cmnd *SCpnt = rq->special;

-	if (rq->cmd_flags & REQ_DISCARD) {
-		free_page((unsigned long)rq->buffer);
-		rq->buffer = NULL;
-	}
+	if (rq->cmd_flags & REQ_DISCARD)
+		__free_page(rq->completion_data);
+
 	if (SCpnt->cmnd != rq->cmd) {
 		mempool_free(SCpnt->cmnd, sd_cdb_pool);
 		SCpnt->cmnd = NULL;
--- a/fs/Makefile
+++ b/fs/Makefile
@ -14,14 +14,13 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		stack.o fs_struct.o statfs.o

 ifeq ($(CONFIG_BLOCK),y)
-obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
 else
 obj-y +=	no-block.o
 endif

 obj-$(CONFIG_PROC_FS) += proc_namespace.o

-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
 obj-y				+= notify/
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@ -333,7 +333,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,

 extern struct bio_set *bioset_create(unsigned int, unsigned int);
 extern void bioset_free(struct bio_set *);
-extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries);
+extern mempool_t *biovec_create_pool(int pool_entries);

 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
 extern void bio_put(struct bio *);
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@ -8,7 +8,13 @@ struct blk_mq_tags;
 struct blk_mq_cpu_notifier {
 	struct list_head list;
 	void *data;
-	void (*notify)(void *data, unsigned long action, unsigned int cpu);
+	int (*notify)(void *data, unsigned long action, unsigned int cpu);
+};
+
+struct blk_mq_ctxmap {
+	unsigned int map_size;
+	unsigned int bits_per_word;
+	struct blk_align_bitmap *map;
 };

 struct blk_mq_hw_ctx {
@ -18,7 +24,11 @@ struct blk_mq_hw_ctx {
 	} ____cacheline_aligned_in_smp;

 	unsigned long		state;		/* BLK_MQ_S_* flags */
-	struct delayed_work	delayed_work;
+	struct delayed_work	run_work;
+	struct delayed_work	delay_work;
+	cpumask_var_t		cpumask;
+	int			next_cpu;
+	int			next_cpu_batch;

 	unsigned long		flags;		/* BLK_MQ_F_* flags */

@ -27,13 +37,13 @@ struct blk_mq_hw_ctx {

 	void			*driver_data;

+	struct blk_mq_ctxmap	ctx_map;
+
 	unsigned int		nr_ctx;
 	struct blk_mq_ctx	**ctxs;
-	unsigned int 		nr_ctx_map;
-	unsigned long		*ctx_map;

-	struct request		**rqs;
-	struct list_head	page_list;
+	unsigned int		wait_index;
+
 	struct blk_mq_tags	*tags;

 	unsigned long		queued;
@ -41,31 +51,40 @@ struct blk_mq_hw_ctx {
 #define BLK_MQ_MAX_DISPATCH_ORDER	10
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

-	unsigned int		queue_depth;
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */

+	atomic_t		nr_active;
+
 	struct blk_mq_cpu_notifier	cpu_notifier;
 	struct kobject		kobj;
 };

-struct blk_mq_reg {
+struct blk_mq_tag_set {
 	struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;
-	unsigned int		queue_depth;
+	unsigned int		queue_depth;	/* max hw supported */
 	unsigned int		reserved_tags;
 	unsigned int		cmd_size;	/* per-request extra data */
 	int			numa_node;
 	unsigned int		timeout;
 	unsigned int		flags;		/* BLK_MQ_F_* */
+	void			*driver_data;
+
+	struct blk_mq_tags	**tags;
+
+	struct mutex		tag_list_lock;
+	struct list_head	tag_list;
 };

 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
-typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
-typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (init_request_fn)(void *, struct request *, unsigned int,
+		unsigned int, unsigned int);
+typedef void (exit_request_fn)(void *, struct request *, unsigned int,
+		unsigned int);

 struct blk_mq_ops {
 	/*
@ -85,12 +104,6 @@ struct blk_mq_ops {

 	softirq_done_fn		*complete;

-	/*
-	 * Override for hctx allocations (should probably go)
-	 */
-	alloc_hctx_fn		*alloc_hctx;
-	free_hctx_fn		*free_hctx;
-
 	/*
 	 * Called when the block layer side of a hardware queue has been
 	 * set up, allowing the driver to allocate/init matching structures.
@ -98,6 +111,14 @@ struct blk_mq_ops {
 	 */
 	init_hctx_fn		*init_hctx;
 	exit_hctx_fn		*exit_hctx;
+
+	/*
+	 * Called for every command allocated by the block layer to allow
+	 * the driver to set up driver specific data.
+	 * Ditto for exit/teardown.
+	 */
+	init_request_fn		*init_request;
+	exit_request_fn		*exit_request;
 };

 enum {
@ -107,18 +128,24 @@ enum {

 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_SHOULD_SORT	= 1 << 1,
-	BLK_MQ_F_SHOULD_IPI	= 1 << 2,
+	BLK_MQ_F_TAG_SHARED	= 1 << 2,
+	BLK_MQ_F_SG_MERGE	= 1 << 3,
+	BLK_MQ_F_SYSFS_UP	= 1 << 4,

 	BLK_MQ_S_STOPPED	= 0,
+	BLK_MQ_S_TAG_ACTIVE	= 1,

 	BLK_MQ_MAX_DEPTH	= 2048,
+
+	BLK_MQ_CPU_WORK_BATCH	= 8,
 };

-struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 int blk_mq_register_disk(struct gendisk *);
 void blk_mq_unregister_disk(struct gendisk *);
-int blk_mq_init_commands(struct request_queue *, int (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
-void blk_mq_free_commands(struct request_queue *, void (*free)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
+
+int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
+void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

@ -126,28 +153,28 @@ void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
-struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
-struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+		gfp_t gfp, bool reserved);
+struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag);

 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int);
-void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);

-bool blk_mq_end_io_partial(struct request *rq, int error,
-		unsigned int nr_bytes);
-static inline void blk_mq_end_io(struct request *rq, int error)
-{
-	bool done = !blk_mq_end_io_partial(rq, error, blk_rq_bytes(rq));
-	BUG_ON(!done);
-}
+void blk_mq_end_io(struct request *rq, int error);
+void __blk_mq_end_io(struct request *rq, int error);

+void blk_mq_requeue_request(struct request *rq);
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
+void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_complete_request(struct request *rq);

 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
-void blk_mq_start_stopped_hw_queues(struct request_queue *q);
+void blk_mq_start_hw_queues(struct request_queue *q);
+void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
+void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
+void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);

 /*
 * Driver command data is immediately after the request. So subtract request
@ -162,12 +189,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 	return (void *) rq + sizeof(*rq);
 }

-static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
-					       unsigned int tag)
-{
-	return hctx->rqs[tag];
-}
-
 #define queue_for_each_hw_ctx(q, hctx, i)				\
 	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
 	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@ -190,6 +190,7 @@ enum rq_flag_bits {
 	__REQ_PM,		/* runtime pm request */
 	__REQ_END,		/* last of chain of requests */
 	__REQ_HASHED,		/* on IO scheduler merge hash */
+	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NR_BITS,		/* stops here */
 };

@ -243,5 +244,6 @@ enum rq_flag_bits {
 #define REQ_PM			(1ULL << __REQ_PM)
 #define REQ_END			(1ULL << __REQ_END)
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
+#define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)

 #endif /* __LINUX_BLK_TYPES_H */
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@ -90,15 +90,15 @@ enum rq_cmd_type_bits {
 #define BLK_MAX_CDB	16

 /*
- * try to put the fields that are referenced together in the same cacheline.
- * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init()
- * as well!
+ * Try to put the fields that are referenced together in the same cacheline.
+ *
+ * If you modify this structure, make sure to update blk_rq_init() and
+ * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
 struct request {
 	struct list_head queuelist;
 	union {
 		struct call_single_data csd;
-		struct work_struct mq_flush_work;
 		unsigned long fifo_time;
 	};

@ -178,7 +178,6 @@ struct request {
 	unsigned short ioprio;

 	void *special;		/* opaque pointer available for LLD use */
-	char *buffer;		/* kaddr of the current segment if available */

 	int tag;
 	int errors;
@ -463,6 +462,10 @@ struct request_queue {
 	struct request		*flush_rq;
 	spinlock_t		mq_flush_lock;

+	struct list_head	requeue_list;
+	spinlock_t		requeue_lock;
+	struct work_struct	requeue_work;
+
 	struct mutex		sysfs_lock;

 	int			bypass_depth;
@ -481,6 +484,9 @@ struct request_queue {
 	wait_queue_head_t	mq_freeze_wq;
 	struct percpu_counter	mq_usage_counter;
 	struct list_head	all_q_node;
+
+	struct blk_mq_tag_set	*tag_set;
+	struct list_head	tag_set_list;
 };

 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
@ -504,6 +510,7 @@ struct request_queue {
 #define QUEUE_FLAG_SAME_FORCE  18	/* force complete on same CPU */
 #define QUEUE_FLAG_DEAD        19	/* queue tear-down finished */
 #define QUEUE_FLAG_INIT_DONE   20	/* queue is initialized */
+#define QUEUE_FLAG_NO_SG_MERGE 21	/* don't attempt to merge SG segments*/

 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@ -937,6 +944,7 @@ extern struct request *blk_fetch_request(struct request_queue *q);
 */
 extern bool blk_update_request(struct request *rq, int error,
 			       unsigned int nr_bytes);
+extern void blk_finish_request(struct request *rq, int error);
 extern bool blk_end_request(struct request *rq, int error,
 			    unsigned int nr_bytes);
 extern void blk_end_request_all(struct request *rq, int error);
@ -1053,7 +1061,6 @@ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
 * schedule() where blk_schedule_flush_plug() is called.
 */
 struct blk_plug {
-	unsigned long magic; /* detect uninitialized use-cases */
 	struct list_head list; /* requests */
 	struct list_head mq_list; /* blk-mq requests */
 	struct list_head cb_list; /* md requires an unplug callback */
@ -1102,7 +1109,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 /*
 * tag stuff
 */
-#define blk_rq_tagged(rq)		((rq)->cmd_flags & REQ_QUEUED)
+#define blk_rq_tagged(rq) \
+	((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED))
 extern int blk_queue_start_tag(struct request_queue *, struct request *);
 extern struct request *blk_queue_find_tag(struct request_queue *, int);
 extern void blk_queue_end_tag(struct request_queue *, struct request *);
@ -1370,8 +1378,9 @@ static inline void put_dev_sector(Sector p)
 }

 struct work_struct;
-int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
-int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
+int kblockd_schedule_work(struct work_struct *work);
+int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
+int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);

 #ifdef CONFIG_BLK_CGROUP
 /*
--- a/mm/Makefile
+++ b/mm/Makefile
@ -30,7 +30,6 @@ endif

 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o

-obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o