Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe: "A collection of fixes since the merge window; - fix for a double elevator module release, from Chao Yu. Ancient bug. - the splice() MORE flag fix from Christophe Leroy. - a fix for NVMe, fixing a patch that went in in the merge window. From Keith. - two fixes for blk-mq CPU hotplug handling, from Ming Lei. - bdi vs blockdev lifetime fix from Neil Brown, fixing and oops in md. - two blk-mq fixes from Shaohua, fixing a race on queue stop and a bad merge issue with FUA writes. - division-by-zero fix for writeback from Tejun. - a block bounce page accounting fix, making sure we inc/dec after bouncing so that pre/post IO pages match up. From Wang YanQing" * 'for-linus' of git://git.kernel.dk/linux-block: splice: sendfile() at once fails for big files blk-mq: don't lose requests if a stopped queue restarts blk-mq: fix FUA request hang block: destroy bdi before blockdev is unregistered. block:bounce: fix call inc_|dec_zone_page_state on different pages confuse value of NR_BOUNCE elevator: fix double release of elevator module writeback: use |1 instead of +1 to protect against div by zero blk-mq: fix CPU hotplug handling blk-mq: fix race between timeout and CPU hotplug NVMe: Fix VPD B0 max sectors translation
2015-05-08 19:49:35 -07:00 · 2015-05-08 19:49:35 -07:00 · 1daac193f2
commit 1daac193f2
parent 41c64bb19c 0ff28d9f46
11 changed files with 60 additions and 41 deletions
--- a/block/blk-core.c
+++ b/block/blk-core.c
@ -552,6 +552,8 @@ void blk_cleanup_queue(struct request_queue *q)
 		q->queue_lock = &q->__queue_lock;
 	spin_unlock_irq(lock);

+	bdi_destroy(&q->backing_dev_info);
+
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
 }
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@ -677,8 +677,11 @@ static void blk_mq_rq_timer(unsigned long priv)
 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
 		mod_timer(&q->timeout, data.next);
 	} else {
-		queue_for_each_hw_ctx(q, hctx, i)
-			blk_mq_tag_idle(hctx);
+		queue_for_each_hw_ctx(q, hctx, i) {
+			/* the hctx may be unmapped, so check it here */
+			if (blk_mq_hw_queue_mapped(hctx))
+				blk_mq_tag_idle(hctx);
+		}
 	}
 }

@ -855,6 +858,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 		spin_lock(&hctx->lock);
 		list_splice(&rq_list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
+		/*
+		 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
+		 * it's possible the queue is stopped and restarted again
+		 * before this. Queue restart will dispatch requests. And since
+		 * requests in rq_list aren't added into hctx->dispatch yet,
+		 * the requests in rq_list might get lost.
+		 *
+		 * blk_mq_run_hw_queue() already checks the STOPPED bit
+		 **/
+		blk_mq_run_hw_queue(hctx, true);
 	}
 }

@ -1571,22 +1584,6 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
 	return NOTIFY_OK;
 }

-static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
-{
-	struct request_queue *q = hctx->queue;
-	struct blk_mq_tag_set *set = q->tag_set;
-
-	if (set->tags[hctx->queue_num])
-		return NOTIFY_OK;
-
-	set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
-	if (!set->tags[hctx->queue_num])
-		return NOTIFY_STOP;
-
-	hctx->tags = set->tags[hctx->queue_num];
-	return NOTIFY_OK;
-}
-
 static int blk_mq_hctx_notify(void *data, unsigned long action,
 			      unsigned int cpu)
 {
@ -1594,8 +1591,11 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,

 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
 		return blk_mq_hctx_cpu_offline(hctx, cpu);
-	else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
-		return blk_mq_hctx_cpu_online(hctx, cpu);
+
+	/*
+	 * In case of CPU online, tags may be reallocated
+	 * in blk_mq_map_swqueue() after mapping is updated.
+	 */

 	return NOTIFY_OK;
 }
@ -1775,6 +1775,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	unsigned int i;
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
+	struct blk_mq_tag_set *set = q->tag_set;

 	queue_for_each_hw_ctx(q, hctx, i) {
 		cpumask_clear(hctx->cpumask);
@ -1803,16 +1804,20 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		 * disable it and free the request entries.
 		 */
 		if (!hctx->nr_ctx) {
-			struct blk_mq_tag_set *set = q->tag_set;
-
 			if (set->tags[i]) {
 				blk_mq_free_rq_map(set, set->tags[i], i);
 				set->tags[i] = NULL;
-				hctx->tags = NULL;
 			}
+			hctx->tags = NULL;
 			continue;
 		}

+		/* unmapped hw queue can be remapped after CPU topo changed */
+		if (!set->tags[i])
+			set->tags[i] = blk_mq_init_rq_map(set, i);
+		hctx->tags = set->tags[i];
+		WARN_ON(!hctx->tags);
+
 		/*
 		 * Set the map size to the number of mapped software queues.
 		 * This is more accurate and more efficient than looping
@ -2090,9 +2095,16 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	 */
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_freeze_queue_start(q);
-	list_for_each_entry(q, &all_q_list, all_q_node)
+	list_for_each_entry(q, &all_q_list, all_q_node) {
 		blk_mq_freeze_queue_wait(q);

+		/*
+		 * timeout handler can't touch hw queue during the
+		 * reinitialization
+		 */
+		del_timer_sync(&q->timeout);
+	}
+
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_queue_reinit(q);

--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@ -522,8 +522,6 @@ static void blk_release_queue(struct kobject *kobj)

 	blk_trace_shutdown(q);

-	bdi_destroy(&q->backing_dev_info);
-
 	ida_simple_remove(&blk_queue_ida, q->id);
 	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
--- a/block/bounce.c
+++ b/block/bounce.c
@ -221,8 +221,8 @@ bounce:
 		if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
 			continue;

-		inc_zone_page_state(to->bv_page, NR_BOUNCE);
 		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+		inc_zone_page_state(to->bv_page, NR_BOUNCE);

 		if (rw == WRITE) {
 			char *vto, *vfrom;
--- a/block/elevator.c
+++ b/block/elevator.c
@ -157,7 +157,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,

 	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
 	if (unlikely(!eq))
-		goto err;
+		return NULL;

 	eq->type = e;
 	kobject_init(&eq->kobj, &elv_ktype);
@ -165,10 +165,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
 	hash_init(eq->hash);

 	return eq;
-err:
-	kfree(eq);
-	elevator_put(e);
-	return NULL;
 }
 EXPORT_SYMBOL(elevator_alloc);

--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@ -1620,8 +1620,8 @@ out:

 static void loop_remove(struct loop_device *lo)
 {
-	del_gendisk(lo->lo_disk);
 	blk_cleanup_queue(lo->lo_queue);
+	del_gendisk(lo->lo_disk);
 	blk_mq_free_tag_set(&lo->tag_set);
 	put_disk(lo->lo_disk);
 	kfree(lo);
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@ -944,7 +944,8 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 static int nvme_trans_bdev_limits_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					u8 *inq_response, int alloc_len)
 {
-	__be32 max_sectors = cpu_to_be32(queue_max_hw_sectors(ns->queue));
+	__be32 max_sectors = cpu_to_be32(
+		nvme_block_nr(ns, queue_max_hw_sectors(ns->queue)));
 	__be32 max_discard = cpu_to_be32(ns->queue->limits.max_discard_sectors);
 	__be32 discard_desc_count = cpu_to_be32(0x100);

--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@ -4818,12 +4818,12 @@ static void md_free(struct kobject *ko)
 	if (mddev->sysfs_state)
 		sysfs_put(mddev->sysfs_state);

+	if (mddev->queue)
+		blk_cleanup_queue(mddev->queue);
 	if (mddev->gendisk) {
 		del_gendisk(mddev->gendisk);
 		put_disk(mddev->gendisk);
 	}
-	if (mddev->queue)
-		blk_cleanup_queue(mddev->queue);

 	kfree(mddev);
 }
--- a/fs/splice.c
+++ b/fs/splice.c
@ -1161,7 +1161,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	long ret, bytes;
 	umode_t i_mode;
 	size_t len;
-	int i, flags;
+	int i, flags, more;

 	/*
 	 * We require the input being a regular file, as we don't want to
@ -1204,6 +1204,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	 * Don't block on output, we have to drain the direct pipe.
 	 */
 	sd->flags &= ~SPLICE_F_NONBLOCK;
+	more = sd->flags & SPLICE_F_MORE;

 	while (len) {
 		size_t read_len;
@ -1216,6 +1217,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 		read_len = ret;
 		sd->total_len = read_len;

+		/*
+		 * If more data is pending, set SPLICE_F_MORE
+		 * If this is the last data and SPLICE_F_MORE was not set
+		 * initially, clears it.
+		 */
+		if (read_len < len)
+			sd->flags |= SPLICE_F_MORE;
+		else if (!more)
+			sd->flags &= ~SPLICE_F_MORE;
 		/*
 		 * NOTE: nonblocking mode only applies to the input. We
 		 * must not do the output in nonblocking mode as then we
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@ -220,7 +220,7 @@ enum rq_flag_bits {

 /* This mask is used for both bio and request merge checking */
 #define REQ_NOMERGE_FLAGS \
-	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
+	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_FLUSH_SEQ)

 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_THROTTLED		(1ULL << __REQ_THROTTLED)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@ -580,7 +580,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
 	long x;

 	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
-		    limit - setpoint + 1);
+		      (limit - setpoint) | 1);
 	pos_ratio = x;
 	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
 	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@ -807,7 +807,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 	 * scale global setpoint to bdi's:
 	 *	bdi_setpoint = setpoint * bdi_thresh / thresh
 	 */
-	x = div_u64((u64)bdi_thresh << 16, thresh + 1);
+	x = div_u64((u64)bdi_thresh << 16, thresh | 1);
 	bdi_setpoint = setpoint * (u64)x >> 16;
 	/*
 	 * Use span=(8*write_bw) in single bdi case as indicated by
@ -822,7 +822,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,

 	if (bdi_dirty < x_intercept - span / 4) {
 		pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
-				    x_intercept - bdi_setpoint + 1);
+				      (x_intercept - bdi_setpoint) | 1);
 	} else
 		pos_ratio /= 4;