From ac1f9ef211d5dd70110fa4bec6e8866b2c3a965e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 12 Feb 2015 15:20:35 -0500
Subject: [PATCH 01/34] dm log userspace: split flush_entry_pool to be per
 dirty-log

Use a single slab cache to allocate a mempool for each dirty-log.
This _should_ eliminate DM's need for io_schedule_timeout() in
mempool_alloc(); so io_schedule() should be sufficient now.

Also, rename struct flush_entry to dm_dirty_log_flush_entry to allow
KMEM_CACHE() to create a meaningful global name for the slab cache.

Also, eliminate some holes in struct log_c by rearranging members.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Heinz Mauelshagen <heinzm@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 84 ++++++++++++++++--------------
 1 file changed, 45 insertions(+), 39 deletions(-)

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 03177ca0b009..39fa00733431 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -17,7 +17,9 @@
 
 #define DM_LOG_USERSPACE_VSN "1.3.0"
 
-struct flush_entry {
+#define FLUSH_ENTRY_POOL_SIZE 16
+
+struct dm_dirty_log_flush_entry {
 	int type;
 	region_t region;
 	struct list_head list;
@@ -34,22 +36,14 @@ struct flush_entry {
 struct log_c {
 	struct dm_target *ti;
 	struct dm_dev *log_dev;
-	uint32_t region_size;
-	region_t region_count;
-	uint64_t luid;
-	char uuid[DM_UUID_LEN];
 
 	char *usr_argv_str;
 	uint32_t usr_argc;
 
-	/*
-	 * in_sync_hint gets set when doing is_remote_recovering.  It
-	 * represents the first region that needs recovery.  IOW, the
-	 * first zero bit of sync_bits.  This can be useful for to limit
-	 * traffic for calls like is_remote_recovering and get_resync_work,
-	 * but be take care in its use for anything else.
-	 */
-	uint64_t in_sync_hint;
+	uint32_t region_size;
+	region_t region_count;
+	uint64_t luid;
+	char uuid[DM_UUID_LEN];
 
 	/*
 	 * Mark and clear requests are held until a flush is issued
@@ -61,6 +55,15 @@ struct log_c {
 	struct list_head mark_list;
 	struct list_head clear_list;
 
+	/*
+	 * in_sync_hint gets set when doing is_remote_recovering.  It
+	 * represents the first region that needs recovery.  IOW, the
+	 * first zero bit of sync_bits.  This can be useful for to limit
+	 * traffic for calls like is_remote_recovering and get_resync_work,
+	 * but be take care in its use for anything else.
+	 */
+	uint64_t in_sync_hint;
+
 	/*
 	 * Workqueue for flush of clear region requests.
 	 */
@@ -72,19 +75,11 @@ struct log_c {
 	 * Combine userspace flush and mark requests for efficiency.
 	 */
 	uint32_t integrated_flush;
+
+	mempool_t *flush_entry_pool;
 };
 
-static mempool_t *flush_entry_pool;
-
-static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
-{
-	return kmalloc(sizeof(struct flush_entry), gfp_mask);
-}
-
-static void flush_entry_free(void *element, void *pool_data)
-{
-	kfree(element);
-}
+static struct kmem_cache *_flush_entry_cache;
 
 static int userspace_do_request(struct log_c *lc, const char *uuid,
 				int request_type, char *data, size_t data_size,
@@ -254,6 +249,14 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 		goto out;
 	}
 
+	lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE,
+							_flush_entry_cache);
+	if (!lc->flush_entry_pool) {
+		DMERR("Failed to create flush_entry_pool");
+		r = -ENOMEM;
+		goto out;
+	}
+
 	/*
 	 * Send table string and get back any opened device.
 	 */
@@ -310,6 +313,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 out:
 	kfree(devices_rdata);
 	if (r) {
+		if (lc->flush_entry_pool)
+			mempool_destroy(lc->flush_entry_pool);
 		kfree(lc);
 		kfree(ctr_str);
 	} else {
@@ -338,6 +343,8 @@ static void userspace_dtr(struct dm_dirty_log *log)
 	if (lc->log_dev)
 		dm_put_device(lc->ti, lc->log_dev);
 
+	mempool_destroy(lc->flush_entry_pool);
+
 	kfree(lc->usr_argv_str);
 	kfree(lc);
 
@@ -461,7 +468,7 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
 static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
 {
 	int r = 0;
-	struct flush_entry *fe;
+	struct dm_dirty_log_flush_entry *fe;
 
 	list_for_each_entry(fe, flush_list, list) {
 		r = userspace_do_request(lc, lc->uuid, fe->type,
@@ -481,7 +488,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
 	int r = 0;
 	int count;
 	uint32_t type = 0;
-	struct flush_entry *fe, *tmp_fe;
+	struct dm_dirty_log_flush_entry *fe, *tmp_fe;
 	LIST_HEAD(tmp_list);
 	uint64_t group[MAX_FLUSH_GROUP_COUNT];
 
@@ -563,7 +570,8 @@ static int userspace_flush(struct dm_dirty_log *log)
 	LIST_HEAD(clear_list);
 	int mark_list_is_empty;
 	int clear_list_is_empty;
-	struct flush_entry *fe, *tmp_fe;
+	struct dm_dirty_log_flush_entry *fe, *tmp_fe;
+	mempool_t *flush_entry_pool = lc->flush_entry_pool;
 
 	spin_lock_irqsave(&lc->flush_lock, flags);
 	list_splice_init(&lc->mark_list, &mark_list);
@@ -643,10 +651,10 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
 {
 	unsigned long flags;
 	struct log_c *lc = log->context;
-	struct flush_entry *fe;
+	struct dm_dirty_log_flush_entry *fe;
 
 	/* Wait for an allocation, but _never_ fail */
-	fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
+	fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO);
 	BUG_ON(!fe);
 
 	spin_lock_irqsave(&lc->flush_lock, flags);
@@ -672,7 +680,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
 {
 	unsigned long flags;
 	struct log_c *lc = log->context;
-	struct flush_entry *fe;
+	struct dm_dirty_log_flush_entry *fe;
 
 	/*
 	 * If we fail to allocate, we skip the clearing of
@@ -680,7 +688,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
 	 * to cause the region to be resync'ed when the
 	 * device is activated next time.
 	 */
-	fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
+	fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC);
 	if (!fe) {
 		DMERR("Failed to allocate memory to clear region.");
 		return;
@@ -886,18 +894,16 @@ static int __init userspace_dirty_log_init(void)
 {
 	int r = 0;
 
-	flush_entry_pool = mempool_create(100, flush_entry_alloc,
-					  flush_entry_free, NULL);
-
-	if (!flush_entry_pool) {
-		DMWARN("Unable to create flush_entry_pool:  No memory.");
+	_flush_entry_cache = KMEM_CACHE(dm_dirty_log_flush_entry, 0);
+	if (!_flush_entry_cache) {
+		DMWARN("Unable to create flush_entry_cache: No memory.");
 		return -ENOMEM;
 	}
 
 	r = dm_ulog_tfr_init();
 	if (r) {
 		DMWARN("Unable to initialize userspace log communications");
-		mempool_destroy(flush_entry_pool);
+		kmem_cache_destroy(_flush_entry_cache);
 		return r;
 	}
 
@@ -905,7 +911,7 @@ static int __init userspace_dirty_log_init(void)
 	if (r) {
 		DMWARN("Couldn't register userspace dirty log type");
 		dm_ulog_tfr_exit();
-		mempool_destroy(flush_entry_pool);
+		kmem_cache_destroy(_flush_entry_cache);
 		return r;
 	}
 
@@ -917,7 +923,7 @@ static void __exit userspace_dirty_log_exit(void)
 {
 	dm_dirty_log_type_unregister(&_userspace_type);
 	dm_ulog_tfr_exit();
-	mempool_destroy(flush_entry_pool);
+	kmem_cache_destroy(_flush_entry_cache);
 
 	DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
 	return;

From 75da39bf256c27e25f395b191ead79f323772672 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 20 Feb 2015 12:58:03 +0000
Subject: [PATCH 02/34] dm cache policy mq: keep track of the number of entries
 in a multiqueue

Small optimisation, now queue_empty() doesn't need to walk all levels of
the multiqueue.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 13f547a4eeb6..ca05d69191e8 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -126,6 +126,7 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
 #define NR_QUEUE_LEVELS 16u
 
 struct queue {
+	unsigned nr_elts;
 	struct list_head qs[NR_QUEUE_LEVELS];
 };
 
@@ -133,23 +134,14 @@ static void queue_init(struct queue *q)
 {
 	unsigned i;
 
+	q->nr_elts = 0;
 	for (i = 0; i < NR_QUEUE_LEVELS; i++)
 		INIT_LIST_HEAD(q->qs + i);
 }
 
-/*
- * Checks to see if the queue is empty.
- * FIXME: reduce cpu usage.
- */
 static bool queue_empty(struct queue *q)
 {
-	unsigned i;
-
-	for (i = 0; i < NR_QUEUE_LEVELS; i++)
-		if (!list_empty(q->qs + i))
-			return false;
-
-	return true;
+	return q->nr_elts == 0;
 }
 
 /*
@@ -157,11 +149,13 @@ static bool queue_empty(struct queue *q)
  */
 static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
 {
+	q->nr_elts++;
 	list_add_tail(elt, q->qs + level);
 }
 
-static void queue_remove(struct list_head *elt)
+static void queue_remove(struct queue *q, struct list_head *elt)
 {
+	q->nr_elts--;
 	list_del(elt);
 }
 
@@ -197,6 +191,7 @@ static struct list_head *queue_pop(struct queue *q)
 	struct list_head *r = queue_peek(q);
 
 	if (r) {
+		q->nr_elts--;
 		list_del(r);
 
 		/* have we just emptied the bottom level? */
@@ -496,7 +491,11 @@ static void push(struct mq_policy *mq, struct entry *e)
  */
 static void del(struct mq_policy *mq, struct entry *e)
 {
-	queue_remove(&e->list);
+	if (in_cache(mq, e))
+		queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
+	else
+		queue_remove(&mq->pre_cache, &e->list);
+
 	hash_remove(e);
 }
 

From c74ffc5c63b0b2753bedd49bdc1196d570f66803 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 20 Feb 2015 13:01:22 +0000
Subject: [PATCH 03/34] dm cache policy mq: remove queue_shift_down()

queue_shift_down() didn't adjust the hit_counts to the new levels, so it
just had the effect of scrambling levels.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index ca05d69191e8..3c86b5efe78f 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -159,18 +159,6 @@ static void queue_remove(struct queue *q, struct list_head *elt)
 	list_del(elt);
 }
 
-/*
- * Shifts all regions down one level.  This has no effect on the order of
- * the queue.
- */
-static void queue_shift_down(struct queue *q)
-{
-	unsigned level;
-
-	for (level = 1; level < NR_QUEUE_LEVELS; level++)
-		list_splice_init(q->qs + level, q->qs + level - 1);
-}
-
 /*
  * Gives us the oldest entry of the lowest popoulated level.  If the first
  * level is emptied then we shift down one level.
@@ -193,10 +181,6 @@ static struct list_head *queue_pop(struct queue *q)
 	if (r) {
 		q->nr_elts--;
 		list_del(r);
-
-		/* have we just emptied the bottom level? */
-		if (list_empty(q->qs))
-			queue_shift_down(q);
 	}
 
 	return r;

From 3e45c91e5cdd0cfd3cc1228628602c8e7e587157 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 20 Feb 2015 13:49:45 +0000
Subject: [PATCH 04/34] dm cache policy mq: track entries hit this 'tick' via
 sentinel objects

A sentinel object is placed on each level of the multiqueues.  When an
object is hit it is requeued behind the sentinel.  When the tick is
incremented we iterate through all objects behind the sentinel and
update the hit_count, then reposition the sentinel at the very back.

This saves memory by avoiding tracking the tick explicitly for every
struct entry object in the multiqueues.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c | 117 ++++++++++++++++++++++----------
 1 file changed, 82 insertions(+), 35 deletions(-)

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 3c86b5efe78f..97b14309df90 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -124,10 +124,12 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
  * sorted queue.
  */
 #define NR_QUEUE_LEVELS 16u
+#define NR_SENTINELS NR_QUEUE_LEVELS * 3
 
 struct queue {
 	unsigned nr_elts;
 	struct list_head qs[NR_QUEUE_LEVELS];
+	struct list_head sentinels[NR_SENTINELS];
 };
 
 static void queue_init(struct queue *q)
@@ -135,8 +137,10 @@ static void queue_init(struct queue *q)
 	unsigned i;
 
 	q->nr_elts = 0;
-	for (i = 0; i < NR_QUEUE_LEVELS; i++)
+	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
 		INIT_LIST_HEAD(q->qs + i);
+		INIT_LIST_HEAD(q->sentinels + i);
+	}
 }
 
 static bool queue_empty(struct queue *q)
@@ -159,6 +163,11 @@ static void queue_remove(struct queue *q, struct list_head *elt)
 	list_del(elt);
 }
 
+static bool is_sentinel(struct queue *q, struct list_head *h)
+{
+	return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
+}
+
 /*
  * Gives us the oldest entry of the lowest popoulated level.  If the first
  * level is emptied then we shift down one level.
@@ -166,10 +175,12 @@ static void queue_remove(struct queue *q, struct list_head *elt)
 static struct list_head *queue_peek(struct queue *q)
 {
 	unsigned level;
+	struct list_head *h;
 
 	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		if (!list_empty(q->qs + level))
-			return q->qs[level].next;
+		list_for_each(h, q->qs + level)
+			if (!is_sentinel(q, h))
+				return h;
 
 	return NULL;
 }
@@ -196,6 +207,37 @@ static struct list_head *list_pop(struct list_head *lh)
 	return r;
 }
 
+/*
+ * Sometimes we want to iterate through entries that have been pushed since
+ * a certain event.  We use sentinel entries on the queues to delimit these
+ * 'tick' events.
+ */
+static void queue_tick(struct queue *q)
+{
+	unsigned i;
+
+	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+		list_del(q->sentinels + i);
+		list_add_tail(q->sentinels + i, q->qs + i);
+	}
+}
+
+typedef void (*iter_fn)(struct list_head *, void *);
+static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
+{
+	unsigned i;
+	struct list_head *h;
+
+	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+		list_for_each_prev(h, q->qs + i) {
+			if (is_sentinel(q, h))
+				break;
+
+			fn(h, context);
+		}
+	}
+}
+
 /*----------------------------------------------------------------*/
 
 /*
@@ -212,7 +254,6 @@ struct entry {
 	bool dirty:1;
 	unsigned hit_count;
 	unsigned generation;
-	unsigned tick;
 };
 
 /*
@@ -460,7 +501,6 @@ static bool in_cache(struct mq_policy *mq, struct entry *e)
  */
 static void push(struct mq_policy *mq, struct entry *e)
 {
-	e->tick = mq->tick;
 	hash_insert(mq, e);
 
 	if (in_cache(mq, e))
@@ -507,14 +547,6 @@ static struct entry *peek(struct queue *q)
 	return h ? container_of(h, struct entry, list) : NULL;
 }
 
-/*
- * Has this entry already been updated?
- */
-static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
-{
-	return mq->tick == e->tick;
-}
-
 /*
  * The promotion threshold is adjusted every generation.  As are the counts
  * of the entries.
@@ -566,20 +598,9 @@ static void check_generation(struct mq_policy *mq)
  * Whenever we use an entry we bump up it's hit counter, and push it to the
  * back to it's current level.
  */
-static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
+static void requeue(struct mq_policy *mq, struct entry *e)
 {
-	if (updated_this_tick(mq, e))
-		return;
-
-	e->hit_count++;
-	mq->hit_count++;
 	check_generation(mq);
-
-	/* generation adjustment, to stop the counts increasing forever. */
-	/* FIXME: divide? */
-	/* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
-	e->generation = mq->generation;
-
 	del(mq, e);
 	push(mq, e);
 }
@@ -686,7 +707,7 @@ static int cache_entry_found(struct mq_policy *mq,
 			     struct entry *e,
 			     struct policy_result *result)
 {
-	requeue_and_update_tick(mq, e);
+	requeue(mq, e);
 
 	if (in_cache(mq, e)) {
 		result->op = POLICY_HIT;
@@ -724,7 +745,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 	new_e->dirty = false;
 	new_e->hit_count = e->hit_count;
 	new_e->generation = e->generation;
-	new_e->tick = e->tick;
 
 	del(mq, e);
 	free_entry(&mq->pre_cache_pool, e);
@@ -740,18 +760,16 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 				 int data_dir, struct policy_result *result)
 {
 	int r = 0;
-	bool updated = updated_this_tick(mq, e);
 
-	if ((!discarded_oblock && updated) ||
-	    !should_promote(mq, e, discarded_oblock, data_dir)) {
-		requeue_and_update_tick(mq, e);
+	if (!should_promote(mq, e, discarded_oblock, data_dir)) {
+		requeue(mq, e);
 		result->op = POLICY_MISS;
 
 	} else if (!can_migrate)
 		r = -EWOULDBLOCK;
 
 	else {
-		requeue_and_update_tick(mq, e);
+		requeue(mq, e);
 		r = pre_cache_to_cache(mq, e, result);
 	}
 
@@ -888,12 +906,36 @@ static void mq_destroy(struct dm_cache_policy *p)
 	kfree(mq);
 }
 
+static void update_pre_cache_hits(struct list_head *h, void *context)
+{
+	struct entry *e = container_of(h, struct entry, list);
+	e->hit_count++;
+}
+
+static void update_cache_hits(struct list_head *h, void *context)
+{
+	struct mq_policy *mq = context;
+	struct entry *e = container_of(h, struct entry, list);
+	e->hit_count++;
+	mq->hit_count++;
+}
+
 static void copy_tick(struct mq_policy *mq)
 {
-	unsigned long flags;
+	unsigned long flags, tick;
 
 	spin_lock_irqsave(&mq->tick_lock, flags);
-	mq->tick = mq->tick_protected;
+	tick = mq->tick_protected;
+	if (tick != mq->tick) {
+		queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
+		queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
+		queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
+		mq->tick = tick;
+	}
+
+	queue_tick(&mq->pre_cache);
+	queue_tick(&mq->cache_dirty);
+	queue_tick(&mq->cache_clean);
 	spin_unlock_irqrestore(&mq->tick_lock, flags);
 }
 
@@ -995,10 +1037,15 @@ static int mq_save_hints(struct mq_policy *mq, struct queue *q,
 {
 	int r;
 	unsigned level;
+	struct list_head *h;
 	struct entry *e;
 
 	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		list_for_each_entry(e, q->qs + level, list) {
+		list_for_each(h, q->qs + level) {
+			if (is_sentinel(q, h))
+				continue;
+
+			e = container_of(h, struct entry, list);
 			r = fn(context, infer_cblock(&mq->cache_pool, e),
 			       e->oblock, e->hit_count);
 			if (r)

From fdecee3224d90e51c611198baeb0c38e568ca0e8 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 20 Feb 2015 13:54:14 +0000
Subject: [PATCH 05/34] dm cache policy mq: remove unused generation member of
 struct entry

Remove to stop wasting memory.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 97b14309df90..6bfb39411fa9 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -253,7 +253,6 @@ struct entry {
 	 */
 	bool dirty:1;
 	unsigned hit_count;
-	unsigned generation;
 };
 
 /*
@@ -744,7 +743,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 	new_e->oblock = e->oblock;
 	new_e->dirty = false;
 	new_e->hit_count = e->hit_count;
-	new_e->generation = e->generation;
 
 	del(mq, e);
 	free_entry(&mq->pre_cache_pool, e);
@@ -796,7 +794,6 @@ static void insert_in_pre_cache(struct mq_policy *mq,
 	e->dirty = false;
 	e->oblock = oblock;
 	e->hit_count = 1;
-	e->generation = mq->generation;
 	push(mq, e);
 }
 
@@ -829,7 +826,6 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 	e->oblock = oblock;
 	e->dirty = false;
 	e->hit_count = 1;
-	e->generation = mq->generation;
 	push(mq, e);
 
 	result->cblock = infer_cblock(&mq->cache_pool, e);
@@ -1026,7 +1022,6 @@ static int mq_load_mapping(struct dm_cache_policy *p,
 	e->oblock = oblock;
 	e->dirty = false;	/* this gets corrected in a minute */
 	e->hit_count = hint_valid ? hint : 1;
-	e->generation = mq->generation;
 	push(mq, e);
 
 	return 0;

From e65ff8703f56273c6dc8b77373f4d2bef6e35107 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 20 Feb 2015 14:22:17 +0000
Subject: [PATCH 06/34] dm cache policy mq: try not to writeback data that
 changed in the last second

Writeback takes out a lock on the cache block, so will increase the
latency for any concurrent io.

This patch works by placing 2 sentinel objects on each level of the
multiqueues.  Every WRITEBACK_PERIOD the oldest sentinel gets moved to
the newest end of the queue level.

When looking for writeback work:
  if less than 25% of the cache is clean:
    we select the oldest object with the lowest hit count
  otherwise:
    we select the oldest object that is not past a writeback sentinel.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c | 94 ++++++++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 6bfb39411fa9..3ddd1162334d 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -8,6 +8,7 @@
 #include "dm.h"
 
 #include <linux/hash.h>
+#include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
@@ -126,8 +127,12 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
 #define NR_QUEUE_LEVELS 16u
 #define NR_SENTINELS NR_QUEUE_LEVELS * 3
 
+#define WRITEBACK_PERIOD HZ
+
 struct queue {
 	unsigned nr_elts;
+	bool current_writeback_sentinels;
+	unsigned long next_writeback;
 	struct list_head qs[NR_QUEUE_LEVELS];
 	struct list_head sentinels[NR_SENTINELS];
 };
@@ -137,12 +142,21 @@ static void queue_init(struct queue *q)
 	unsigned i;
 
 	q->nr_elts = 0;
+	q->current_writeback_sentinels = false;
+	q->next_writeback = 0;
 	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
 		INIT_LIST_HEAD(q->qs + i);
 		INIT_LIST_HEAD(q->sentinels + i);
+		INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
+		INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
 	}
 }
 
+static unsigned queue_size(struct queue *q)
+{
+	return q->nr_elts;
+}
+
 static bool queue_empty(struct queue *q)
 {
 	return q->nr_elts == 0;
@@ -197,6 +211,27 @@ static struct list_head *queue_pop(struct queue *q)
 	return r;
 }
 
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct list_head *queue_pop_old(struct queue *q)
+{
+	unsigned level;
+	struct list_head *h;
+
+	for (level = 0; level < NR_QUEUE_LEVELS; level++)
+		list_for_each(h, q->qs + level) {
+			if (is_sentinel(q, h))
+				break;
+
+			q->nr_elts--;
+			list_del(h);
+			return h;
+		}
+
+	return NULL;
+}
+
 static struct list_head *list_pop(struct list_head *lh)
 {
 	struct list_head *r = lh->next;
@@ -207,6 +242,31 @@ static struct list_head *list_pop(struct list_head *lh)
 	return r;
 }
 
+static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
+{
+	if (q->current_writeback_sentinels)
+		return q->sentinels + NR_QUEUE_LEVELS + level;
+	else
+		return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
+}
+
+static void queue_update_writeback_sentinels(struct queue *q)
+{
+	unsigned i;
+	struct list_head *h;
+
+	if (time_after(jiffies, q->next_writeback)) {
+		for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+			h = writeback_sentinel(q, i);
+			list_del(h);
+			list_add_tail(h, q->qs + i);
+		}
+
+		q->next_writeback = jiffies + WRITEBACK_PERIOD;
+		q->current_writeback_sentinels = !q->current_writeback_sentinels;
+	}
+}
+
 /*
  * Sometimes we want to iterate through entries that have been pushed since
  * a certain event.  We use sentinel entries on the queues to delimit these
@@ -540,6 +600,20 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q)
 	return e;
 }
 
+static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
+{
+	struct entry *e;
+	struct list_head *h = queue_pop_old(q);
+
+	if (!h)
+		return NULL;
+
+	e = container_of(h, struct entry, list);
+	hash_remove(e);
+
+	return e;
+}
+
 static struct entry *peek(struct queue *q)
 {
 	struct list_head *h = queue_peek(q);
@@ -932,6 +1006,7 @@ static void copy_tick(struct mq_policy *mq)
 	queue_tick(&mq->pre_cache);
 	queue_tick(&mq->cache_dirty);
 	queue_tick(&mq->cache_clean);
+	queue_update_writeback_sentinels(&mq->cache_dirty);
 	spin_unlock_irqrestore(&mq->tick_lock, flags);
 }
 
@@ -1112,10 +1187,27 @@ static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
 	return r;
 }
 
+#define CLEAN_TARGET_PERCENTAGE 25
+
+static bool clean_target_met(struct mq_policy *mq)
+{
+	/*
+	 * Cache entries may not be populated.  So we're cannot rely on the
+	 * size of the clean queue.
+	 */
+	unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
+	unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;
+
+	return nr_clean >= target;
+}
+
 static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
 			      dm_cblock_t *cblock)
 {
-	struct entry *e = pop(mq, &mq->cache_dirty);
+	struct entry *e = pop_old(mq, &mq->cache_dirty);
+
+	if (!e && !clean_target_met(mq))
+		e = pop(mq, &mq->cache_dirty);
 
 	if (!e)
 		return -ENODATA;

From e73f6e8a0de8ce28edef986d86720ef83dd2864f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 27 Feb 2015 15:25:31 -0500
Subject: [PATCH 07/34] dm switch: fix Documentation to use plain text

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/switch.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/device-mapper/switch.txt b/Documentation/device-mapper/switch.txt
index 8897d0494838..424835e57f27 100644
--- a/Documentation/device-mapper/switch.txt
+++ b/Documentation/device-mapper/switch.txt
@@ -47,8 +47,8 @@ consume far too much memory.
 Using this device-mapper switch target we can now build a two-layer
 device hierarchy:
 
-    Upper Tier – Determine which array member the I/O should be sent to.
-    Lower Tier – Load balance amongst paths to a particular member.
+    Upper Tier - Determine which array member the I/O should be sent to.
+    Lower Tier - Load balance amongst paths to a particular member.
 
 The lower tier consists of a single dm multipath device for each member.
 Each of these multipath devices contains the set of paths directly to

From 09c2d53101da87f5ab4084643d2f8c718b3ab3cf Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 27 Feb 2015 22:25:26 -0500
Subject: [PATCH 08/34] dm: rename __dm_get_reserved_ios() helper to
 __dm_get_module_param()

__dm_get_module_param() could be useful for future DM module parameters
besides those related to "reserved_ios".

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8001fe9e3434..0e5f3441fcda 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -250,35 +250,35 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
  */
 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 
-static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
+static unsigned __dm_get_module_param(unsigned *module_param,
 				      unsigned def, unsigned max)
 {
-	unsigned ios = ACCESS_ONCE(*reserved_ios);
-	unsigned modified_ios = 0;
+	unsigned param = ACCESS_ONCE(*module_param);
+	unsigned modified_param = 0;
 
-	if (!ios)
-		modified_ios = def;
-	else if (ios > max)
-		modified_ios = max;
+	if (!param)
+		modified_param = def;
+	else if (param > max)
+		modified_param = max;
 
-	if (modified_ios) {
-		(void)cmpxchg(reserved_ios, ios, modified_ios);
-		ios = modified_ios;
+	if (modified_param) {
+		(void)cmpxchg(module_param, param, modified_param);
+		param = modified_param;
 	}
 
-	return ios;
+	return param;
 }
 
 unsigned dm_get_reserved_bio_based_ios(void)
 {
-	return __dm_get_reserved_ios(&reserved_bio_based_ios,
+	return __dm_get_module_param(&reserved_bio_based_ios,
 				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 
 unsigned dm_get_reserved_rq_based_ios(void)
 {
-	return __dm_get_reserved_ios(&reserved_rq_based_ios,
+	return __dm_get_module_param(&reserved_rq_based_ios,
 				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);

From 52b09914af86fa3e728175c1125c91520e437b2f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 23 Feb 2015 16:36:41 -0500
Subject: [PATCH 09/34] dm: remove unnecessary wrapper around blk_lld_busy

There is no need for DM to export a wrapper around the already exported
blk_lld_busy().

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c         | 2 +-
 drivers/md/dm.c               | 6 ------
 include/linux/device-mapper.h | 5 -----
 3 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d376dc87716e..add6391f3f8e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1627,7 +1627,7 @@ static int __pgpath_busy(struct pgpath *pgpath)
 {
 	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
 
-	return dm_underlying_device_busy(q);
+	return blk_lld_busy(q);
 }
 
 /*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0e5f3441fcda..e7095ebb8d64 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2006,12 +2006,6 @@ out:
 	dm_put_live_table(md, srcu_idx);
 }
 
-int dm_underlying_device_busy(struct request_queue *q)
-{
-	return blk_lld_busy(q);
-}
-EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
-
 static int dm_lld_busy(struct request_queue *q)
 {
 	int r;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index fd23978d93fe..51cc1deb7af3 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -605,9 +605,4 @@ static inline unsigned long to_bytes(sector_t n)
 	return (n << SECTOR_SHIFT);
 }
 
-/*-----------------------------------------------------------------
- * Helper for block layer and dm core operations
- *---------------------------------------------------------------*/
-int dm_underlying_device_busy(struct request_queue *q);
-
 #endif	/* _LINUX_DEVICE_MAPPER_H */

From d56b9b28a4a5d9e61dd99154b986e760373e2392 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 23 Feb 2015 19:10:15 -0500
Subject: [PATCH 10/34] dm: remove request-based DM queue's lld_busy_fn hook

DM multipath is the only caller of blk_lld_busy() -- which calls a
queue's lld_busy_fn hook.  Request-based DM doesn't support stacking
multipath devices so there is no reason to register the lld_busy_fn hook
on a multipath device's queue using blk_queue_lld_busy().

As such, remove functions dm_lld_busy and dm_table_any_busy_target.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c | 14 --------------
 drivers/md/dm.c       | 17 -----------------
 drivers/md/dm.h       |  1 -
 3 files changed, 32 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 6554d9148927..057312048b68 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1677,20 +1677,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 	return r;
 }
 
-int dm_table_any_busy_target(struct dm_table *t)
-{
-	unsigned i;
-	struct dm_target *ti;
-
-	for (i = 0; i < t->num_targets; i++) {
-		ti = t->targets + i;
-		if (ti->type->busy && ti->type->busy(ti))
-			return 1;
-	}
-
-	return 0;
-}
-
 struct mapped_device *dm_table_get_md(struct dm_table *t)
 {
 	return t->md;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e7095ebb8d64..cc8aed2e3f88 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2006,22 +2006,6 @@ out:
 	dm_put_live_table(md, srcu_idx);
 }
 
-static int dm_lld_busy(struct request_queue *q)
-{
-	int r;
-	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table_fast(md);
-
-	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
-		r = 1;
-	else
-		r = dm_table_any_busy_target(map);
-
-	dm_put_live_table_fast(md);
-
-	return r;
-}
-
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r = bdi_bits;
@@ -2545,7 +2529,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	dm_init_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
-	blk_queue_lld_busy(md->queue, dm_lld_busy);
 
 	/* Also initialize the request-based DM worker thread */
 	init_kthread_worker(&md->kworker);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 59f53e79db82..db495863fa5f 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -70,7 +70,6 @@ void dm_table_presuspend_undo_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
-int dm_table_any_busy_target(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);

From ff36ab34583ae23250a4bf39805d69771e7e0131 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 23 Feb 2015 17:56:37 -0500
Subject: [PATCH 11/34] dm: remove request-based logic from make_request_fn
 wrapper

The old dm_request() method used for q->make_request_fn had a branch for
request-based DM support but it isn't needed given that
dm_init_request_based_queue() sets it to the standard blk_queue_bio()
anyway.

Cleanup dm_init_md_queue() to be DM device-type agnostic and have
dm_setup_md_queue() properly finish queue setup based on DM device-type
(bio-based vs request-based).

A followup block patch can be made to remove the export for
blk_queue_bio() now that DM no longer calls it directly.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index cc8aed2e3f88..43e0d1a85a60 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1693,7 +1693,7 @@ out:
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
  */
-static void _dm_request(struct request_queue *q, struct bio *bio)
+static void dm_make_request(struct request_queue *q, struct bio *bio)
 {
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
@@ -1725,16 +1725,6 @@ int dm_request_based(struct mapped_device *md)
 	return blk_queue_stackable(md->queue);
 }
 
-static void dm_request(struct request_queue *q, struct bio *bio)
-{
-	struct mapped_device *md = q->queuedata;
-
-	if (dm_request_based(md))
-		blk_queue_bio(q, bio);
-	else
-		_dm_request(q, bio);
-}
-
 static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 {
 	int r;
@@ -2100,9 +2090,8 @@ static void dm_init_md_queue(struct mapped_device *md)
 	md->queue->queuedata = md;
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
-	blk_queue_make_request(md->queue, dm_request);
+
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
-	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 }
 
 /*
@@ -2335,7 +2324,7 @@ int dm_queue_merge_is_compulsory(struct request_queue *q)
 	if (!q->merge_bvec_fn)
 		return 0;
 
-	if (q->make_request_fn == dm_request) {
+	if (q->make_request_fn == dm_make_request) {
 		dev_md = q->queuedata;
 		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
 			return 0;
@@ -2545,9 +2534,15 @@ static int dm_init_request_based_queue(struct mapped_device *md)
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
-	if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
-		DMWARN("Cannot initialize queue for request-based mapped device");
-		return -EINVAL;
+	if (dm_md_type_request_based(md)) {
+		if (!dm_init_request_based_queue(md)) {
+			DMWARN("Cannot initialize queue for request-based mapped device");
+			return -EINVAL;
+		}
+	} else {
+		/* bio-based specific initialization */
+		blk_queue_make_request(md->queue, dm_make_request);
+		blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 	}
 
 	return 0;

From 9a0e609e3fd8a95c96629b9fbde6b8c5b9a1456a Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 24 Feb 2015 11:03:22 -0500
Subject: [PATCH 12/34] dm: only run the queue on completion if congested or no
 requests pending

On really fast storage it can be beneficial to delay running the
request_queue to allow the elevator more opportunity to merge requests.

Otherwise, it has been observed that requests are being sent to
q->request_fn much quicker than is ideal on IOPS-bound backends.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 43e0d1a85a60..7924c00e0716 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1024,10 +1024,13 @@ static void end_clone_bio(struct bio *clone, int error)
  */
 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 {
+	int nr_requests_pending;
+
 	atomic_dec(&md->pending[rw]);
 
 	/* nudge anyone waiting on suspend queue */
-	if (!md_in_flight(md))
+	nr_requests_pending = md_in_flight(md);
+	if (!nr_requests_pending)
 		wake_up(&md->wait);
 
 	/*
@@ -1036,8 +1039,11 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 	 * back into ->request_fn() could deadlock attempting to grab the
 	 * queue lock again.
 	 */
-	if (run_queue)
-		blk_run_queue_async(md->queue);
+	if (run_queue) {
+		if (!nr_requests_pending ||
+		    (nr_requests_pending >= md->queue->nr_congestion_on))
+			blk_run_queue_async(md->queue);
+	}
 
 	/*
 	 * dm_put() must be at the end of this function. See the comment above

From 9d1deb83d489364f8749a3a1ba1689efb07d94b0 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 24 Feb 2015 20:49:18 -0500
Subject: [PATCH 13/34] dm: don't schedule delayed run of the queue if nothing
 to do

In request-based DM's dm_request_fn(), if blk_peek_request() returns
NULL just return.  Avoids unnecessary blk_delay_queue().

Reported-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7924c00e0716..6f854287384b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1963,7 +1963,7 @@ static void dm_request_fn(struct request_queue *q)
 	while (!blk_queue_stopped(q)) {
 		rq = blk_peek_request(q);
 		if (!rq)
-			goto delay_and_out;
+			goto out;
 
 		/* always use block 0 to find the target for flushes for now */
 		pos = 0;

From d548b34b062b60b4f4df295a0b4823dfda1f1fc4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 5 Mar 2015 22:21:10 -0500
Subject: [PATCH 14/34] dm: reduce the queue delay used in dm_request_fn from
 100ms to 10ms

Commit 7eaceaccab ("block: remove per-queue plugging") didn't justify
DM's use of a 100ms delay; such an extended delay is a liability when
there is reason to re-kick the queue.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6f854287384b..98eb02d32e6e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1997,7 +1997,7 @@ static void dm_request_fn(struct request_queue *q)
 	goto out;
 
 delay_and_out:
-	blk_delay_queue(q, HZ / 10);
+	blk_delay_queue(q, HZ / 100);
 out:
 	dm_put_live_table(md, srcu_idx);
 }

From de3ec86dff160d35c817bb70eeaeff6e392f44a4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 24 Feb 2015 21:58:21 -0500
Subject: [PATCH 15/34] dm: don't start current request if it would've merged
 with the previous

Request-based DM's dm_request_fn() is so fast to pull requests off the
queue that steps need to be taken to promote merging by avoiding request
processing if it makes sense.

If the current request would've merged with previous request let the
current request stay on the queue longer.

Suggested-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 98eb02d32e6e..2ae78b31e4c0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,7 @@
 #include <linux/delay.h>
 #include <linux/wait.h>
 #include <linux/kthread.h>
+#include <linux/elevator.h> /* for rq_end_sector() */
 
 #include <trace/events/block.h>
 
@@ -216,6 +217,10 @@ struct mapped_device {
 
 	struct kthread_worker kworker;
 	struct task_struct *kworker_task;
+
+	/* for request-based merge heuristic in dm_request_fn() */
+	sector_t last_rq_pos;
+	int last_rq_rw;
 };
 
 /*
@@ -1930,6 +1935,9 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 	blk_start_request(orig);
 	atomic_inc(&md->pending[rq_data_dir(orig)]);
 
+	md->last_rq_pos = rq_end_sector(orig);
+	md->last_rq_rw = rq_data_dir(orig);
+
 	/*
 	 * Hold the md reference here for the in-flight I/O.
 	 * We can't rely on the reference count by device opener,
@@ -1982,6 +1990,10 @@ static void dm_request_fn(struct request_queue *q)
 			continue;
 		}
 
+		if (md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+		    md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
+			goto delay_and_out;
+
 		if (ti->type->busy && ti->type->busy(ti))
 			goto delay_and_out;
 

From b898320d683d54c2bc17b748b9742d2b601ad453 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 27 Feb 2015 17:58:42 -0500
Subject: [PATCH 16/34] dm sysfs: introduce ability to add writable attributes

Add DM_ATTR_RW() macro and establish .store method in dm_sysfs_ops.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-sysfs.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index c62c5ab6aed5..1271c31709fd 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -11,7 +11,7 @@
 struct dm_sysfs_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct mapped_device *, char *);
-	ssize_t (*store)(struct mapped_device *, char *);
+	ssize_t (*store)(struct mapped_device *, const char *, size_t count);
 };
 
 #define DM_ATTR_RO(_name) \
@@ -39,6 +39,31 @@ static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
 	return ret;
 }
 
+#define DM_ATTR_RW(_name) \
+struct dm_sysfs_attr dm_attr_##_name = \
+	__ATTR(_name, S_IRUGO | S_IWUSR, dm_attr_##_name##_show, dm_attr_##_name##_store)
+
+static ssize_t dm_attr_store(struct kobject *kobj, struct attribute *attr,
+			     const char *page, size_t count)
+{
+	struct dm_sysfs_attr *dm_attr;
+	struct mapped_device *md;
+	ssize_t ret;
+
+	dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
+	if (!dm_attr->store)
+		return -EIO;
+
+	md = dm_get_from_kobject(kobj);
+	if (!md)
+		return -EINVAL;
+
+	ret = dm_attr->store(md, page, count);
+	dm_put(md);
+
+	return ret;
+}
+
 static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
 {
 	if (dm_copy_name_and_uuid(md, buf, NULL))
@@ -77,12 +102,9 @@ static struct attribute *dm_attrs[] = {
 
 static const struct sysfs_ops dm_sysfs_ops = {
 	.show	= dm_attr_show,
+	.store	= dm_attr_store,
 };
 
-/*
- * dm kobject is embedded in mapped_device structure
- * no need to define release function here
- */
 static struct kobj_type dm_ktype = {
 	.sysfs_ops	= &dm_sysfs_ops,
 	.default_attrs	= dm_attrs,

From 0ce65797a77ee780f62909d3128bf08b9735718b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 26 Feb 2015 00:50:28 -0500
Subject: [PATCH 17/34] dm: impose configurable deadline for dm_request_fn's
 merge heuristic

Otherwise, for sequential workloads, the dm_request_fn can allow
excessive request merging at the expense of increased service time.

Add a per-device sysfs attribute to allow the user to control how long a
request, that is a reasonable merge candidate, can be queued on the
request queue.  The resolution of this request dispatch deadline is in
microseconds (ranging from 1 to 100000 usecs), to set a 20us deadline:
  echo 20 > /sys/block/dm-7/dm/rq_based_seq_io_merge_deadline

The dm_request_fn's merge heuristic and associated extra accounting is
disabled by default (rq_based_seq_io_merge_deadline is 0).

This sysfs attribute is not applicable to bio-based DM devices so it
will only ever report 0 for them.

By allowing a request to remain on the queue it will block others
requests on the queue.  But introducing a short dequeue delay has proven
very effective at enabling certain sequential IO workloads on really
fast, yet IOPS constrained, devices to build up slightly larger IOs --
yielding 90+% throughput improvements.  Having precise control over the
time taken to wait for larger requests to build affords control beyond
that of waiting for certain IO sizes to accumulate (which would require
a deadline anyway).  This knob will only ever make sense with sequential
IO workloads and the particular value used is storage configuration
specific.

Given the expected niche use-case for when this knob is useful it has
been deemed acceptable to expose this relatively crude method for
crafting optimal IO on specific storage -- especially given the solution
is simple yet effective.  In the context of DM multipath, it is
advisable to tune this sysfs attribute to a value that offers the best
performance for the common case (e.g. if 4 paths are expected active,
tune for that; if paths fail then performance may be slightly reduced).

Alternatives were explored to have request-based DM autotune this value
(e.g. if/when paths fail) but they were quickly deemed too fragile and
complex to warrant further design and development time.  If this problem
proves more common as faster storage emerges we'll have to look at
elevating a generic solution into the block core.

Tested-by: Shiva Krishna Merla <shivakrishna.merla@netapp.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/ABI/testing/sysfs-block-dm | 14 ++++++
 drivers/md/dm-sysfs.c                    |  2 +
 drivers/md/dm.c                          | 57 ++++++++++++++++++++++--
 drivers/md/dm.h                          |  4 ++
 4 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-block-dm b/Documentation/ABI/testing/sysfs-block-dm
index 87ca5691e29b..ac4b6fe245d9 100644
--- a/Documentation/ABI/testing/sysfs-block-dm
+++ b/Documentation/ABI/testing/sysfs-block-dm
@@ -23,3 +23,17 @@ Description:	Device-mapper device suspend state.
 		Contains the value 1 while the device is suspended.
 		Otherwise it contains 0. Read-only attribute.
 Users:		util-linux, device-mapper udev rules
+
+What:		/sys/block/dm-<num>/dm/rq_based_seq_io_merge_deadline
+Date:		March 2015
+KernelVersion:	4.1
+Contact:	dm-devel@redhat.com
+Description:	Allow control over how long a request that is a
+		reasonable merge candidate can be queued on the request
+		queue.  The resolution of this deadline is in
+		microseconds (ranging from 1 to 100000 usecs).
+		Setting this attribute to 0 (the default) will disable
+		request-based DM's merge heuristic and associated extra
+		accounting.  This attribute is not applicable to
+		bio-based DM devices so it will only ever report 0 for
+		them.
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 1271c31709fd..f5bb3944f75e 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -92,11 +92,13 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
 static DM_ATTR_RO(name);
 static DM_ATTR_RO(uuid);
 static DM_ATTR_RO(suspended);
+static DM_ATTR_RW(rq_based_seq_io_merge_deadline);
 
 static struct attribute *dm_attrs[] = {
 	&dm_attr_name.attr,
 	&dm_attr_uuid.attr,
 	&dm_attr_suspended.attr,
+	&dm_attr_rq_based_seq_io_merge_deadline.attr,
 	NULL,
 };
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2ae78b31e4c0..5294e016e92b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,7 @@
 #include <linux/delay.h>
 #include <linux/wait.h>
 #include <linux/kthread.h>
+#include <linux/ktime.h>
 #include <linux/elevator.h> /* for rq_end_sector() */
 
 #include <trace/events/block.h>
@@ -219,8 +220,10 @@ struct mapped_device {
 	struct task_struct *kworker_task;
 
 	/* for request-based merge heuristic in dm_request_fn() */
-	sector_t last_rq_pos;
+	unsigned seq_rq_merge_deadline_usecs;
 	int last_rq_rw;
+	sector_t last_rq_pos;
+	ktime_t last_rq_start_time;
 };
 
 /*
@@ -1935,8 +1938,11 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 	blk_start_request(orig);
 	atomic_inc(&md->pending[rq_data_dir(orig)]);
 
-	md->last_rq_pos = rq_end_sector(orig);
-	md->last_rq_rw = rq_data_dir(orig);
+	if (md->seq_rq_merge_deadline_usecs) {
+		md->last_rq_pos = rq_end_sector(orig);
+		md->last_rq_rw = rq_data_dir(orig);
+		md->last_rq_start_time = ktime_get();
+	}
 
 	/*
 	 * Hold the md reference here for the in-flight I/O.
@@ -1948,6 +1954,45 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 	dm_get(md);
 }
 
+#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+{
+	return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
+}
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+						     const char *buf, size_t count)
+{
+	unsigned deadline;
+
+	if (!dm_request_based(md))
+		return count;
+
+	if (kstrtouint(buf, 10, &deadline))
+		return -EINVAL;
+
+	if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
+		deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
+
+	md->seq_rq_merge_deadline_usecs = deadline;
+
+	return count;
+}
+
+static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
+{
+	ktime_t kt_deadline;
+
+	if (!md->seq_rq_merge_deadline_usecs)
+		return false;
+
+	kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
+	kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
+
+	return !ktime_after(ktime_get(), kt_deadline);
+}
+
 /*
  * q->request_fn for request-based dm.
  * Called with the queue lock held.
@@ -1990,7 +2035,8 @@ static void dm_request_fn(struct request_queue *q)
 			continue;
 		}
 
-		if (md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+		if (dm_request_peeked_before_merge_deadline(md) &&
+		    md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
 		    md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
 			goto delay_and_out;
 
@@ -2532,6 +2578,9 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	if (!q)
 		return 0;
 
+	/* disable dm_request_fn's merge heuristic by default */
+	md->seq_rq_merge_deadline_usecs = 0;
+
 	md->queue = q;
 	dm_init_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index db495863fa5f..5522422cc6c4 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -234,4 +234,8 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen
 	return !maxlen || strlen(result) + 1 >= maxlen;
 }
 
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+						     const char *buf, size_t count);
+
 #endif

From bfebd1cdb497a57757c83f5fbf1a29931591e2a4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Sun, 8 Mar 2015 00:51:47 -0500
Subject: [PATCH 18/34] dm: add full blk-mq support to request-based DM

Commit e5863d9ad ("dm: allocate requests in target when stacking on
blk-mq devices") served as the first step toward fully utilizing blk-mq
in request-based DM -- it enabled stacking an old-style (request_fn)
request_queue ontop of the underlying blk-mq device(s).  That first step
didn't improve performance of DM multipath ontop of fast blk-mq devices
(e.g. NVMe) because the top-level old-style request_queue was severely
limited by the queue_lock.

The second step offered here enables stacking a blk-mq request_queue
ontop of the underlying blk-mq device(s).  This unlocks significant
performance gains on fast blk-mq devices, Keith Busch tested on his NVMe
testbed and offered this really positive news:

 "Just providing a performance update. All my fio tests are getting
  roughly equal performance whether accessed through the raw block
  device or the multipath device mapper (~470k IOPS). I could only push
  ~20% of the raw iops through dm before this conversion, so this latest
  tree is looking really solid from a performance standpoint."

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Tested-by: Keith Busch <keith.busch@intel.com>
---
 drivers/md/dm-mpath.c         |   2 +-
 drivers/md/dm-table.c         |  11 +-
 drivers/md/dm.c               | 317 +++++++++++++++++++++++++++-------
 include/uapi/linux/dm-ioctl.h |   4 +-
 4 files changed, 261 insertions(+), 73 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index add6391f3f8e..c8f07e5a9a17 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1703,7 +1703,7 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 8, 0},
+	.version = {1, 9, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 057312048b68..66600cab9fa5 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/blk-mq.h>
 
 #define DM_MSG_PREFIX "table"
 
@@ -1695,9 +1696,13 @@ void dm_table_run_md_queue_async(struct dm_table *t)
 	md = dm_table_get_md(t);
 	queue = dm_get_md_queue(md);
 	if (queue) {
-		spin_lock_irqsave(queue->queue_lock, flags);
-		blk_run_queue_async(queue);
-		spin_unlock_irqrestore(queue->queue_lock, flags);
+		if (queue->mq_ops)
+			blk_mq_run_hw_queues(queue, true);
+		else {
+			spin_lock_irqsave(queue->queue_lock, flags);
+			blk_run_queue_async(queue);
+			spin_unlock_irqrestore(queue->queue_lock, flags);
+		}
 	}
 }
 EXPORT_SYMBOL(dm_table_run_md_queue_async);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5294e016e92b..3a66baac76ed 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -23,6 +23,7 @@
 #include <linux/kthread.h>
 #include <linux/ktime.h>
 #include <linux/elevator.h> /* for rq_end_sector() */
+#include <linux/blk-mq.h>
 
 #include <trace/events/block.h>
 
@@ -224,6 +225,9 @@ struct mapped_device {
 	int last_rq_rw;
 	sector_t last_rq_pos;
 	ktime_t last_rq_start_time;
+
+	/* for blk-mq request-based DM support */
+	struct blk_mq_tag_set tag_set;
 };
 
 /*
@@ -1025,6 +1029,11 @@ static void end_clone_bio(struct bio *clone, int error)
 	blk_update_request(tio->orig, 0, nr_bytes);
 }
 
+static struct dm_rq_target_io *tio_from_request(struct request *rq)
+{
+	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+}
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -1048,8 +1057,10 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 	 * queue lock again.
 	 */
 	if (run_queue) {
-		if (!nr_requests_pending ||
-		    (nr_requests_pending >= md->queue->nr_congestion_on))
+		if (md->queue->mq_ops)
+			blk_mq_run_hw_queues(md->queue, true);
+		else if (!nr_requests_pending ||
+			 (nr_requests_pending >= md->queue->nr_congestion_on))
 			blk_run_queue_async(md->queue);
 	}
 
@@ -1062,13 +1073,17 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 static void free_rq_clone(struct request *clone)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
 
 	blk_rq_unprep_clone(clone);
+
 	if (clone->q && clone->q->mq_ops)
 		tio->ti->type->release_clone_rq(clone);
 	else
-		free_clone_request(tio->md, clone);
-	free_rq_tio(tio);
+		free_clone_request(md, clone);
+
+	if (!md->queue->mq_ops)
+		free_rq_tio(tio);
 }
 
 /*
@@ -1097,17 +1112,22 @@ static void dm_end_request(struct request *clone, int error)
 	}
 
 	free_rq_clone(clone);
-	blk_end_request_all(rq, error);
+	if (!rq->q->mq_ops)
+		blk_end_request_all(rq, error);
+	else
+		blk_mq_end_request(rq, error);
 	rq_completed(md, rw, true);
 }
 
 static void dm_unprep_request(struct request *rq)
 {
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
 	struct request *clone = tio->clone;
 
-	rq->special = NULL;
-	rq->cmd_flags &= ~REQ_DONTPREP;
+	if (!rq->q->mq_ops) {
+		rq->special = NULL;
+		rq->cmd_flags &= ~REQ_DONTPREP;
+	}
 
 	if (clone)
 		free_rq_clone(clone);
@@ -1116,18 +1136,29 @@ static void dm_unprep_request(struct request *rq)
 /*
  * Requeue the original request of a clone.
  */
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
-						 struct request *rq)
+static void old_requeue_request(struct request *rq)
 {
-	int rw = rq_data_dir(rq);
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
-	dm_unprep_request(rq);
-
 	spin_lock_irqsave(q->queue_lock, flags);
 	blk_requeue_request(q, rq);
 	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_requeue_unmapped_original_request(struct mapped_device *md,
+						 struct request *rq)
+{
+	int rw = rq_data_dir(rq);
+
+	dm_unprep_request(rq);
+
+	if (!rq->q->mq_ops)
+		old_requeue_request(rq);
+	else {
+		blk_mq_requeue_request(rq);
+		blk_mq_kick_requeue_list(rq->q);
+	}
 
 	rq_completed(md, rw, false);
 }
@@ -1139,33 +1170,42 @@ static void dm_requeue_unmapped_request(struct request *clone)
 	dm_requeue_unmapped_original_request(tio->md, tio->orig);
 }
 
-static void __stop_queue(struct request_queue *q)
+static void old_stop_queue(struct request_queue *q)
 {
+	unsigned long flags;
+
+	if (blk_queue_stopped(q))
+		return;
+
+	spin_lock_irqsave(q->queue_lock, flags);
 	blk_stop_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 static void stop_queue(struct request_queue *q)
+{
+	if (!q->mq_ops)
+		old_stop_queue(q);
+	else
+		blk_mq_stop_hw_queues(q);
+}
+
+static void old_start_queue(struct request_queue *q)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	__stop_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void __start_queue(struct request_queue *q)
-{
 	if (blk_queue_stopped(q))
 		blk_start_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 static void start_queue(struct request_queue *q)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	__start_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	if (!q->mq_ops)
+		old_start_queue(q);
+	else
+		blk_mq_start_stopped_hw_queues(q, true);
 }
 
 static void dm_done(struct request *clone, int error, bool mapped)
@@ -1206,13 +1246,20 @@ static void dm_done(struct request *clone, int error, bool mapped)
 static void dm_softirq_done(struct request *rq)
 {
 	bool mapped = true;
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
 	struct request *clone = tio->clone;
+	int rw;
 
 	if (!clone) {
-		blk_end_request_all(rq, tio->error);
-		rq_completed(tio->md, rq_data_dir(rq), false);
-		free_rq_tio(tio);
+		rw = rq_data_dir(rq);
+		if (!rq->q->mq_ops) {
+			blk_end_request_all(rq, tio->error);
+			rq_completed(tio->md, rw, false);
+			free_rq_tio(tio);
+		} else {
+			blk_mq_end_request(rq, tio->error);
+			rq_completed(tio->md, rw, false);
+		}
 		return;
 	}
 
@@ -1228,7 +1275,7 @@ static void dm_softirq_done(struct request *rq)
  */
 static void dm_complete_request(struct request *rq, int error)
 {
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
 
 	tio->error = error;
 	blk_complete_request(rq);
@@ -1247,7 +1294,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
 }
 
 /*
- * Called with the clone's queue lock held
+ * Called with the clone's queue lock held (for non-blk-mq)
  */
 static void end_clone_request(struct request *clone, int error)
 {
@@ -1808,6 +1855,18 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 
 static void map_tio_request(struct kthread_work *work);
 
+static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
+		     struct mapped_device *md)
+{
+	tio->md = md;
+	tio->ti = NULL;
+	tio->clone = NULL;
+	tio->orig = rq;
+	tio->error = 0;
+	memset(&tio->info, 0, sizeof(tio->info));
+	init_kthread_work(&tio->work, map_tio_request);
+}
+
 static struct dm_rq_target_io *prep_tio(struct request *rq,
 					struct mapped_device *md, gfp_t gfp_mask)
 {
@@ -1819,13 +1878,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
 	if (!tio)
 		return NULL;
 
-	tio->md = md;
-	tio->ti = NULL;
-	tio->clone = NULL;
-	tio->orig = rq;
-	tio->error = 0;
-	memset(&tio->info, 0, sizeof(tio->info));
-	init_kthread_work(&tio->work, map_tio_request);
+	init_tio(tio, rq, md);
 
 	table = dm_get_live_table(md, &srcu_idx);
 	if (!dm_table_mq_request_based(table)) {
@@ -1869,11 +1922,11 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
  * DM_MAPIO_REQUEUE : the original request needs to be requeued
  * < 0              : the request was completed due to failure
  */
-static int map_request(struct dm_target *ti, struct request *rq,
+static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		       struct mapped_device *md)
 {
 	int r;
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_target *ti = tio->ti;
 	struct request *clone = NULL;
 
 	if (tio->clone) {
@@ -1888,7 +1941,7 @@ static int map_request(struct dm_target *ti, struct request *rq,
 		}
 		if (IS_ERR(clone))
 			return DM_MAPIO_REQUEUE;
-		if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
+		if (setup_clone(clone, rq, tio, GFP_NOIO)) {
 			/* -ENOMEM */
 			ti->type->release_clone_rq(clone);
 			return DM_MAPIO_REQUEUE;
@@ -1929,13 +1982,16 @@ static void map_tio_request(struct kthread_work *work)
 	struct request *rq = tio->orig;
 	struct mapped_device *md = tio->md;
 
-	if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
+	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
 		dm_requeue_unmapped_original_request(md, rq);
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
 {
-	blk_start_request(orig);
+	if (!orig->q->mq_ops)
+		blk_start_request(orig);
+	else
+		blk_mq_start_request(orig);
 	atomic_inc(&md->pending[rq_data_dir(orig)]);
 
 	if (md->seq_rq_merge_deadline_usecs) {
@@ -2045,7 +2101,7 @@ static void dm_request_fn(struct request_queue *q)
 
 		dm_start_request(md, rq);
 
-		tio = rq->special;
+		tio = tio_from_request(rq);
 		/* Establish tio->ti before queuing work (map_tio_request) */
 		tio->ti = ti;
 		queue_kthread_work(&md->kworker, &tio->work);
@@ -2142,7 +2198,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 {
 	/*
 	 * Request-based dm devices cannot be stacked on top of bio-based dm
-	 * devices.  The type of this dm device has not been decided yet.
+	 * devices.  The type of this dm device may not have been decided yet.
 	 * The type is decided at the first table loading time.
 	 * To prevent problematic device stacking, clear the queue flag
 	 * for request stacking support until then.
@@ -2150,7 +2206,15 @@ static void dm_init_md_queue(struct mapped_device *md)
 	 * This queue is new, so no concurrency on the queue_flags.
 	 */
 	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+}
 
+static void dm_init_old_md_queue(struct mapped_device *md)
+{
+	dm_init_md_queue(md);
+
+	/*
+	 * Initialize aspects of queue that aren't relevant for blk-mq
+	 */
 	md->queue->queuedata = md;
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
@@ -2273,6 +2337,7 @@ static void unlock_fs(struct mapped_device *md);
 static void free_dev(struct mapped_device *md)
 {
 	int minor = MINOR(disk_devt(md->disk));
+	bool using_blk_mq = !!md->queue->mq_ops;
 
 	unlock_fs(md);
 	destroy_workqueue(md->wq);
@@ -2298,6 +2363,8 @@ static void free_dev(struct mapped_device *md)
 	del_gendisk(md->disk);
 	put_disk(md->disk);
 	blk_cleanup_queue(md->queue);
+	if (using_blk_mq)
+		blk_mq_free_tag_set(&md->tag_set);
 	bdput(md->bdev);
 	free_minor(minor);
 
@@ -2457,7 +2524,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	 * This must be done before setting the queue restrictions,
 	 * because request-based dm may be run just after the setting.
 	 */
-	if (dm_table_request_based(t) && !blk_queue_stopped(q))
+	if (dm_table_request_based(t))
 		stop_queue(q);
 
 	__bind_mempools(md, t);
@@ -2539,14 +2606,6 @@ unsigned dm_get_md_type(struct mapped_device *md)
 	return md->type;
 }
 
-static bool dm_md_type_request_based(struct mapped_device *md)
-{
-	unsigned table_type = dm_get_md_type(md);
-
-	return (table_type == DM_TYPE_REQUEST_BASED ||
-		table_type == DM_TYPE_MQ_REQUEST_BASED);
-}
-
 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
 {
 	return md->immutable_target_type;
@@ -2563,6 +2622,14 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
+static void init_rq_based_worker_thread(struct mapped_device *md)
+{
+	/* Initialize the request-based DM worker thread */
+	init_kthread_worker(&md->kworker);
+	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+				       "kdmwork-%s", dm_device_name(md));
+}
+
 /*
  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
  */
@@ -2571,29 +2638,131 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	struct request_queue *q = NULL;
 
 	if (md->queue->elevator)
-		return 1;
+		return 0;
 
 	/* Fully initialize the queue */
 	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
 	if (!q)
-		return 0;
+		return -EINVAL;
 
 	/* disable dm_request_fn's merge heuristic by default */
 	md->seq_rq_merge_deadline_usecs = 0;
 
 	md->queue = q;
-	dm_init_md_queue(md);
+	dm_init_old_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 
-	/* Also initialize the request-based DM worker thread */
-	init_kthread_worker(&md->kworker);
-	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
-				       "kdmwork-%s", dm_device_name(md));
+	init_rq_based_worker_thread(md);
 
 	elv_register_queue(md->queue);
 
-	return 1;
+	return 0;
+}
+
+static int dm_mq_init_request(void *data, struct request *rq,
+			      unsigned int hctx_idx, unsigned int request_idx,
+			      unsigned int numa_node)
+{
+	struct mapped_device *md = data;
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+
+	/*
+	 * Must initialize md member of tio, otherwise it won't
+	 * be available in dm_mq_queue_rq.
+	 */
+	tio->md = md;
+
+	return 0;
+}
+
+static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+			  const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+	struct mapped_device *md = tio->md;
+	int srcu_idx;
+	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+	struct dm_target *ti;
+	sector_t pos;
+
+	/* always use block 0 to find the target for flushes for now */
+	pos = 0;
+	if (!(rq->cmd_flags & REQ_FLUSH))
+		pos = blk_rq_pos(rq);
+
+	ti = dm_table_find_target(map, pos);
+	if (!dm_target_is_valid(ti)) {
+		dm_put_live_table(md, srcu_idx);
+		DMERR_LIMIT("request attempted access beyond the end of device");
+		/*
+		 * Must perform setup, that rq_completed() requires,
+		 * before returning BLK_MQ_RQ_QUEUE_ERROR
+		 */
+		dm_start_request(md, rq);
+		return BLK_MQ_RQ_QUEUE_ERROR;
+	}
+	dm_put_live_table(md, srcu_idx);
+
+	if (ti->type->busy && ti->type->busy(ti))
+		return BLK_MQ_RQ_QUEUE_BUSY;
+
+	dm_start_request(md, rq);
+
+	/* Init tio using md established in .init_request */
+	init_tio(tio, rq, md);
+
+	/* Establish tio->ti before queuing work (map_tio_request) */
+	tio->ti = ti;
+	queue_kthread_work(&md->kworker, &tio->work);
+
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static struct blk_mq_ops dm_mq_ops = {
+	.queue_rq = dm_mq_queue_rq,
+	.map_queue = blk_mq_map_queue,
+	.complete = dm_softirq_done,
+	.init_request = dm_mq_init_request,
+};
+
+static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
+{
+	struct request_queue *q;
+	int err;
+
+	memset(&md->tag_set, 0, sizeof(md->tag_set));
+	md->tag_set.ops = &dm_mq_ops;
+	md->tag_set.queue_depth = BLKDEV_MAX_RQ;
+	md->tag_set.numa_node = NUMA_NO_NODE;
+	md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	md->tag_set.nr_hw_queues = 1;
+	md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
+	md->tag_set.driver_data = md;
+
+	err = blk_mq_alloc_tag_set(&md->tag_set);
+	if (err)
+		return err;
+
+	q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
+	if (IS_ERR(q)) {
+		err = PTR_ERR(q);
+		goto out_tag_set;
+	}
+	md->queue = q;
+	dm_init_md_queue(md);
+
+	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
+	blk_mq_register_disk(md->disk);
+
+	init_rq_based_worker_thread(md);
+
+	return 0;
+
+out_tag_set:
+	blk_mq_free_tag_set(&md->tag_set);
+	return err;
 }
 
 /*
@@ -2601,15 +2770,29 @@ static int dm_init_request_based_queue(struct mapped_device *md)
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
-	if (dm_md_type_request_based(md)) {
-		if (!dm_init_request_based_queue(md)) {
+	int r;
+	unsigned md_type = dm_get_md_type(md);
+
+	switch (md_type) {
+	case DM_TYPE_REQUEST_BASED:
+		r = dm_init_request_based_queue(md);
+		if (r) {
 			DMWARN("Cannot initialize queue for request-based mapped device");
-			return -EINVAL;
+			return r;
 		}
-	} else {
-		/* bio-based specific initialization */
+		break;
+	case DM_TYPE_MQ_REQUEST_BASED:
+		r = dm_init_request_based_blk_mq_queue(md);
+		if (r) {
+			DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+			return r;
+		}
+		break;
+	case DM_TYPE_BIO_BASED:
+		dm_init_old_md_queue(md);
 		blk_queue_make_request(md->queue, dm_make_request);
 		blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+		break;
 	}
 
 	return 0;
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 889f3a5b7b18..eac8c3641f39 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	30
+#define DM_VERSION_MINOR	31
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2014-12-22)"
+#define DM_VERSION_EXTRA	"-ioctl (2015-3-12)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */

From 022333427a8aa4ccb318a9db90cea4e69ca1826b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 10 Mar 2015 23:49:26 -0400
Subject: [PATCH 19/34] dm: optimize dm_mq_queue_rq to _not_ use kthread if
 using pure blk-mq

dm_mq_queue_rq() is in atomic context so care must be taken to not
sleep -- as such GFP_ATOMIC is used for the md->bs bioset allocations
and dm-mpath's call to blk_get_request().  In the future the bioset
allocations will hopefully go away (by removing support for partial
completions of bios in a cloned request).

Also prepare for supporting DM blk-mq ontop of old-style request_fn
device(s) if a new dm-mod 'use_blk_mq' parameter is set.  The kthread
will still be used to queue work if blk-mq is used ontop of old-style
request_fn device(s).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c |  2 +-
 drivers/md/dm.c       | 64 +++++++++++++++++++++++++++++++++----------
 2 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c8f07e5a9a17..63953477a07c 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -428,7 +428,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	} else {
 		/* blk-mq request-based interface */
 		*__clone = blk_get_request(bdev_get_queue(bdev),
-					   rq_data_dir(rq), GFP_KERNEL);
+					   rq_data_dir(rq), GFP_ATOMIC);
 		if (IS_ERR(*__clone))
 			/* ENOMEM, requeue */
 			return r;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3a66baac76ed..55cadb1a2735 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1077,9 +1077,10 @@ static void free_rq_clone(struct request *clone)
 
 	blk_rq_unprep_clone(clone);
 
-	if (clone->q && clone->q->mq_ops)
+	if (clone->q->mq_ops)
 		tio->ti->type->release_clone_rq(clone);
-	else
+	else if (!md->queue->mq_ops)
+		/* request_fn queue stacked on request_fn queue(s) */
 		free_clone_request(md, clone);
 
 	if (!md->queue->mq_ops)
@@ -1838,15 +1839,25 @@ static int setup_clone(struct request *clone, struct request *rq,
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 				struct dm_rq_target_io *tio, gfp_t gfp_mask)
 {
-	struct request *clone = alloc_clone_request(md, gfp_mask);
+	/*
+	 * Do not allocate a clone if tio->clone was already set
+	 * (see: dm_mq_queue_rq).
+	 */
+	bool alloc_clone = !tio->clone;
+	struct request *clone;
 
-	if (!clone)
-		return NULL;
+	if (alloc_clone) {
+		clone = alloc_clone_request(md, gfp_mask);
+		if (!clone)
+			return NULL;
+	} else
+		clone = tio->clone;
 
 	blk_rq_init(NULL, clone);
 	if (setup_clone(clone, rq, tio, gfp_mask)) {
 		/* -ENOMEM */
-		free_clone_request(md, clone);
+		if (alloc_clone)
+			free_clone_request(md, clone);
 		return NULL;
 	}
 
@@ -1864,7 +1875,8 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
 	tio->orig = rq;
 	tio->error = 0;
 	memset(&tio->info, 0, sizeof(tio->info));
-	init_kthread_work(&tio->work, map_tio_request);
+	if (md->kworker_task)
+		init_kthread_work(&tio->work, map_tio_request);
 }
 
 static struct dm_rq_target_io *prep_tio(struct request *rq,
@@ -1941,7 +1953,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		}
 		if (IS_ERR(clone))
 			return DM_MAPIO_REQUEUE;
-		if (setup_clone(clone, rq, tio, GFP_NOIO)) {
+		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
 			/* -ENOMEM */
 			ti->type->release_clone_rq(clone);
 			return DM_MAPIO_REQUEUE;
@@ -2408,7 +2420,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 	p->bs = NULL;
 
 out:
-	/* mempool bind completed, now no need any mempools in the table */
+	/* mempool bind completed, no longer need any mempools in the table */
 	dm_table_free_md_mempools(t);
 }
 
@@ -2713,9 +2725,24 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	/* Init tio using md established in .init_request */
 	init_tio(tio, rq, md);
 
-	/* Establish tio->ti before queuing work (map_tio_request) */
+	/*
+	 * Establish tio->ti before queuing work (map_tio_request)
+	 * or making direct call to map_request().
+	 */
 	tio->ti = ti;
-	queue_kthread_work(&md->kworker, &tio->work);
+
+	/* Clone the request if underlying devices aren't blk-mq */
+	if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
+		/* clone request is allocated at the end of the pdu */
+		tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
+		if (!clone_rq(rq, md, tio, GFP_ATOMIC))
+			return BLK_MQ_RQ_QUEUE_BUSY;
+		queue_kthread_work(&md->kworker, &tio->work);
+	} else {
+		/* Direct call is fine since .queue_rq allows allocations */
+		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
+			dm_requeue_unmapped_original_request(md, rq);
+	}
 
 	return BLK_MQ_RQ_QUEUE_OK;
 }
@@ -2729,6 +2756,7 @@ static struct blk_mq_ops dm_mq_ops = {
 
 static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
 {
+	unsigned md_type = dm_get_md_type(md);
 	struct request_queue *q;
 	int err;
 
@@ -2738,7 +2766,11 @@ static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
 	md->tag_set.numa_node = NUMA_NO_NODE;
 	md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
 	md->tag_set.nr_hw_queues = 1;
-	md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
+	if (md_type == DM_TYPE_REQUEST_BASED) {
+		/* make the memory for non-blk-mq clone part of the pdu */
+		md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
+	} else
+		md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
 	md->tag_set.driver_data = md;
 
 	err = blk_mq_alloc_tag_set(&md->tag_set);
@@ -2756,7 +2788,8 @@ static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
 	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
 	blk_mq_register_disk(md->disk);
 
-	init_rq_based_worker_thread(md);
+	if (md_type == DM_TYPE_REQUEST_BASED)
+		init_rq_based_worker_thread(md);
 
 	return 0;
 
@@ -2876,7 +2909,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 	set_bit(DMF_FREEING, &md->flags);
 	spin_unlock(&_minor_lock);
 
-	if (dm_request_based(md))
+	if (dm_request_based(md) && md->kworker_task)
 		flush_kthread_worker(&md->kworker);
 
 	/*
@@ -3130,7 +3163,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 	 */
 	if (dm_request_based(md)) {
 		stop_queue(md->queue);
-		flush_kthread_worker(&md->kworker);
+		if (md->kworker_task)
+			flush_kthread_worker(&md->kworker);
 	}
 
 	flush_workqueue(md->wq);

From 17e149b8f73ba116e71e25930dd6f2eb3828792d Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 11 Mar 2015 15:01:09 -0400
Subject: [PATCH 20/34] dm: add 'use_blk_mq' module param and expose in
 per-device ro sysfs attr

Request-based DM's blk-mq support defaults to off; but a user can easily
change the default using the dm_mod.use_blk_mq module/boot option.

Also, you can check what mode a given request-based DM device is using
with: cat /sys/block/dm-X/dm/use_blk_mq

This change enabled further cleanup and reduced work (e.g. the
md->io_pool and md->rq_pool isn't created if using blk-mq).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/ABI/testing/sysfs-block-dm |  8 ++++
 drivers/md/Kconfig                       | 11 +++++
 drivers/md/dm-sysfs.c                    |  9 ++++
 drivers/md/dm-table.c                    |  6 +--
 drivers/md/dm.c                          | 53 ++++++++++++++++++------
 drivers/md/dm.h                          |  5 ++-
 6 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-block-dm b/Documentation/ABI/testing/sysfs-block-dm
index ac4b6fe245d9..f9f2339b9a0a 100644
--- a/Documentation/ABI/testing/sysfs-block-dm
+++ b/Documentation/ABI/testing/sysfs-block-dm
@@ -37,3 +37,11 @@ Description:	Allow control over how long a request that is a
 		accounting.  This attribute is not applicable to
 		bio-based DM devices so it will only ever report 0 for
 		them.
+
+What:		/sys/block/dm-<num>/dm/use_blk_mq
+Date:		March 2015
+KernelVersion:	4.1
+Contact:	dm-devel@redhat.com
+Description:	Request-based Device-mapper blk-mq I/O path mode.
+		Contains the value 1 if the device is using blk-mq.
+		Otherwise it contains 0. Read-only attribute.
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 63e05e32b462..109f9dcc9cab 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -196,6 +196,17 @@ config BLK_DEV_DM
 
 	  If unsure, say N.
 
+config DM_MQ_DEFAULT
+	bool "request-based DM: use blk-mq I/O path by default"
+	depends on BLK_DEV_DM
+	---help---
+	  This option enables the blk-mq based I/O path for request-based
+	  DM devices by default.  With the option the dm_mod.use_blk_mq
+	  module/boot option defaults to Y, without it to N, but it can
+	  still be overriden either way.
+
+	  If unsure say N.
+
 config DM_DEBUG
 	bool "Device mapper debugging support"
 	depends on BLK_DEV_DM
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index f5bb3944f75e..7e818f5f1dc4 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -89,15 +89,24 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
 	return strlen(buf);
 }
 
+static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf)
+{
+	sprintf(buf, "%d\n", dm_use_blk_mq(md));
+
+	return strlen(buf);
+}
+
 static DM_ATTR_RO(name);
 static DM_ATTR_RO(uuid);
 static DM_ATTR_RO(suspended);
+static DM_ATTR_RO(use_blk_mq);
 static DM_ATTR_RW(rq_based_seq_io_merge_deadline);
 
 static struct attribute *dm_attrs[] = {
 	&dm_attr_name.attr,
 	&dm_attr_uuid.attr,
 	&dm_attr_suspended.attr,
+	&dm_attr_use_blk_mq.attr,
 	&dm_attr_rq_based_seq_io_merge_deadline.attr,
 	NULL,
 };
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 66600cab9fa5..8d025f33de92 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -940,7 +940,7 @@ bool dm_table_mq_request_based(struct dm_table *t)
 	return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
 }
 
-static int dm_table_alloc_md_mempools(struct dm_table *t)
+static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
 {
 	unsigned type = dm_table_get_type(t);
 	unsigned per_bio_data_size = 0;
@@ -958,7 +958,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t)
 			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
 		}
 
-	t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size);
+	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
 
@@ -1128,7 +1128,7 @@ int dm_table_complete(struct dm_table *t)
 		return r;
 	}
 
-	r = dm_table_alloc_md_mempools(t);
+	r = dm_table_alloc_md_mempools(t, t->md);
 	if (r)
 		DMERR("unable to allocate mempools");
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 55cadb1a2735..944cdb322708 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -228,8 +228,20 @@ struct mapped_device {
 
 	/* for blk-mq request-based DM support */
 	struct blk_mq_tag_set tag_set;
+	bool use_blk_mq;
 };
 
+#ifdef CONFIG_DM_MQ_DEFAULT
+static bool use_blk_mq = true;
+#else
+static bool use_blk_mq = false;
+#endif
+
+bool dm_use_blk_mq(struct mapped_device *md)
+{
+	return md->use_blk_mq;
+}
+
 /*
  * For mempools pre-allocation at the table loading time.
  */
@@ -2034,7 +2046,7 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
 {
 	unsigned deadline;
 
-	if (!dm_request_based(md))
+	if (!dm_request_based(md) || md->use_blk_mq)
 		return count;
 
 	if (kstrtouint(buf, 10, &deadline))
@@ -2222,6 +2234,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 
 static void dm_init_old_md_queue(struct mapped_device *md)
 {
+	md->use_blk_mq = false;
 	dm_init_md_queue(md);
 
 	/*
@@ -2263,6 +2276,7 @@ static struct mapped_device *alloc_dev(int minor)
 	if (r < 0)
 		goto bad_io_barrier;
 
+	md->use_blk_mq = use_blk_mq;
 	md->type = DM_TYPE_NONE;
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
@@ -2349,7 +2363,6 @@ static void unlock_fs(struct mapped_device *md);
 static void free_dev(struct mapped_device *md)
 {
 	int minor = MINOR(disk_devt(md->disk));
-	bool using_blk_mq = !!md->queue->mq_ops;
 
 	unlock_fs(md);
 	destroy_workqueue(md->wq);
@@ -2375,7 +2388,7 @@ static void free_dev(struct mapped_device *md)
 	del_gendisk(md->disk);
 	put_disk(md->disk);
 	blk_cleanup_queue(md->queue);
-	if (using_blk_mq)
+	if (md->use_blk_mq)
 		blk_mq_free_tag_set(&md->tag_set);
 	bdput(md->bdev);
 	free_minor(minor);
@@ -2388,7 +2401,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
 	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-	if (md->io_pool && md->bs) {
+	if (md->bs) {
 		/* The md already has necessary mempools. */
 		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
 			/*
@@ -2798,13 +2811,21 @@ out_tag_set:
 	return err;
 }
 
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+	if (type == DM_TYPE_BIO_BASED)
+		return type;
+
+	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
+}
+
 /*
  * Setup the DM device's queue based on md's type
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
 	int r;
-	unsigned md_type = dm_get_md_type(md);
+	unsigned md_type = filter_md_type(dm_get_md_type(md), md);
 
 	switch (md_type) {
 	case DM_TYPE_REQUEST_BASED:
@@ -3509,16 +3530,19 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size)
 {
 	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	struct kmem_cache *cachep;
+	struct kmem_cache *cachep = NULL;
 	unsigned int pool_size = 0;
 	unsigned int front_pad;
 
 	if (!pools)
 		return NULL;
 
+	type = filter_md_type(type, md);
+
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
 		cachep = _io_cache;
@@ -3526,13 +3550,13 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 		break;
 	case DM_TYPE_REQUEST_BASED:
+		cachep = _rq_tio_cache;
 		pool_size = dm_get_reserved_rq_based_ios();
 		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
 		if (!pools->rq_pool)
 			goto out;
 		/* fall through to setup remaining rq-based pools */
 	case DM_TYPE_MQ_REQUEST_BASED:
-		cachep = _rq_tio_cache;
 		if (!pool_size)
 			pool_size = dm_get_reserved_rq_based_ios();
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
@@ -3540,12 +3564,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		WARN_ON(per_bio_data_size != 0);
 		break;
 	default:
-		goto out;
+		BUG();
 	}
 
-	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-	if (!pools->io_pool)
-		goto out;
+	if (cachep) {
+		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+		if (!pools->io_pool)
+			goto out;
+	}
 
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3602,6 +3628,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 5522422cc6c4..6123c2bf9150 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -211,6 +211,8 @@ int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 void dm_internal_suspend(struct mapped_device *md);
 void dm_internal_resume(struct mapped_device *md);
 
+bool dm_use_blk_mq(struct mapped_device *md);
+
 int dm_io_init(void);
 void dm_io_exit(void);
 
@@ -220,7 +222,8 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*

From e6e20a7a5f3f49bfee518d5c6849107398d83912 Mon Sep 17 00:00:00 2001
From: Dan Ehrenberg <dehrenberg@chromium.org>
Date: Tue, 10 Feb 2015 15:20:49 -0800
Subject: [PATCH 21/34] init: export name_to_dev_t and mark name argument as
 const

DM will switch its device lookup code to using name_to_dev_t() so it
must be exported.  Also, the @name argument should be marked const.

Signed-off-by: Dan Ehrenberg <dehrenberg@chromium.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 include/linux/mount.h | 2 +-
 init/do_mounts.c      | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/mount.h b/include/linux/mount.h
index c2c561dc0114..bca086d62b1a 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -92,6 +92,6 @@ extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
 extern void mark_mounts_for_expiry(struct list_head *mounts);
 
-extern dev_t name_to_dev_t(char *name);
+extern dev_t name_to_dev_t(const char *name);
 
 #endif /* _LINUX_MOUNT_H */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index eb410083e8e0..c16adfbe4ad6 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -207,7 +207,7 @@ done:
  *	bangs.
  */
 
-dev_t name_to_dev_t(char *name)
+dev_t name_to_dev_t(const char *name)
 {
 	char s[32];
 	char *p;
@@ -286,6 +286,7 @@ fail:
 done:
 	return res;
 }
+EXPORT_SYMBOL_GPL(name_to_dev_t);
 
 static int __init root_dev_setup(char *line)
 {

From 283e7ad0241155710f99a9f39d13313a53336926 Mon Sep 17 00:00:00 2001
From: Dan Ehrenberg <dehrenberg@chromium.org>
Date: Tue, 10 Feb 2015 15:20:50 -0800
Subject: [PATCH 22/34] init: stricter checking of major:minor root= values

In the kernel command-line, previously, root=1:2jakshflaksjdhfa would
be accepted and interpreted just like root=1:2. This patch adds
stricter checking so that additional characters after major:minor are
rejected by root=.

The goal of this change is to help in unifying DM's interpretation of
its block device argument by using existing kernel code (name_to_dev_t).
But DM rejects malformed major:minor pairs, it seems reasonable for
root= to reject them as well.

Signed-off-by: Dan Ehrenberg <dehrenberg@chromium.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 init/do_mounts.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/init/do_mounts.c b/init/do_mounts.c
index c16adfbe4ad6..8369ffa5f33d 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -226,8 +226,9 @@ dev_t name_to_dev_t(const char *name)
 
 	if (strncmp(name, "/dev/", 5) != 0) {
 		unsigned maj, min;
+		char dummy;
 
-		if (sscanf(name, "%u:%u", &maj, &min) == 2) {
+		if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2) {
 			res = MKDEV(maj, min);
 			if (maj != MAJOR(res) || min != MINOR(res))
 				goto fail;

From 644bda6f346038bce7ad3ed48f7044c10dde6d47 Mon Sep 17 00:00:00 2001
From: Dan Ehrenberg <dehrenberg@chromium.org>
Date: Tue, 10 Feb 2015 15:20:51 -0800
Subject: [PATCH 23/34] dm table: fall back to getting device using
 name_to_dev_t()

If a device is used as the root filesystem, it can't be built
off of devices which are within the root filesystem (just like
command line arguments to root=).  For this reason, Linux has a
pseudo-filesystem for root= and MD initialization (based on the
function name_to_dev_t) which handles different ways of specifying
devices including PARTUUID and major:minor.

Switch to using name_to_dev_t() in dm_get_device().  Rather than
having DM assume that all things which are not major:minor are paths in
an already-mounted filesystem, change dm_get_device() to first attempt
to look up the device in the filesystem, and if not found it will fall
back to using name_to_dev_t().

In terms of backwards compatibility, there are some cases where
behavior will be different:
- If you have a file in the current working directory named 1:2 and
  you initialze DM there, then it will try to use that file rather
  than the disk with that major:minor pair as a backing device.
- Similarly for other bdev types which name_to_dev_t() knows how to
  interpret, the previous behavior was to repeatedly check for the
  existence of the file (e.g., while waiting for rootfs to come up)
  but the new behavior is to use the name_to_dev_t() interpretation.
  For example, if you have a file named /dev/ubiblock0_0 which is
  a symlink to /dev/sda3, but it is not yet present when DM starts
  to initialize, then the name_to_dev_t() interpretation will take
  precedence.

These incompatibilities would only show up in really strange setups
with bad practices so we shouldn't have to worry about them.

Signed-off-by: Dan Ehrenberg <dehrenberg@chromium.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8d025f33de92..e0f618b43c25 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -19,6 +19,7 @@
 #include <linux/delay.h>
 #include <linux/atomic.h>
 #include <linux/blk-mq.h>
+#include <linux/mount.h>
 
 #define DM_MSG_PREFIX "table"
 
@@ -373,23 +374,18 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 	int r;
 	dev_t uninitialized_var(dev);
 	struct dm_dev_internal *dd;
-	unsigned int major, minor;
 	struct dm_table *t = ti->table;
-	char dummy;
+	struct block_device *bdev;
 
 	BUG_ON(!t);
 
-	if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
-		/* Extract the major/minor numbers */
-		dev = MKDEV(major, minor);
-		if (MAJOR(dev) != major || MINOR(dev) != minor)
-			return -EOVERFLOW;
+	/* convert the path to a device */
+	bdev = lookup_bdev(path);
+	if (IS_ERR(bdev)) {
+		dev = name_to_dev_t(path);
+		if (!dev)
+			return -ENODEV;
 	} else {
-		/* convert the path to a device */
-		struct block_device *bdev = lookup_bdev(path);
-
-		if (IS_ERR(bdev))
-			return PTR_ERR(bdev);
 		dev = bdev->bd_dev;
 		bdput(bdev);
 	}

From c32a512fdf62de260ee8298436558ea50b94dfcb Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Sun, 15 Mar 2015 13:09:10 -0400
Subject: [PATCH 24/34] dm log userspace transfer: match
 wait_for_completion_timeout return type

Return type of wait_for_completion_timeout() is unsigned long not int.
An appropriately named unsigned long is added and the assignment fixed.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-log-userspace-transfer.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 39ad9664d397..fdf8ec304f8d 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -172,6 +172,7 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
 			 char *rdata, size_t *rdata_size)
 {
 	int r = 0;
+	unsigned long tmo;
 	size_t dummy = 0;
 	int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
 	struct dm_ulog_request *tfr = prealloced_ulog_tfr;
@@ -236,11 +237,11 @@ resend:
 		goto out;
 	}
 
-	r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
+	tmo = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
 	spin_lock(&receiving_list_lock);
 	list_del_init(&(pkg.list));
 	spin_unlock(&receiving_list_lock);
-	if (!r) {
+	if (!tmo) {
 		DMWARN("[%s] Request timed out: [%u/%u] - retrying",
 		       (strlen(uuid) > 8) ?
 		       (uuid + (strlen(uuid) - 8)) : (uuid),

From 18cc980ac8cf8c727d1f7d581b4576ed64bd78a6 Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Wed, 18 Mar 2015 18:59:02 -0400
Subject: [PATCH 25/34] dm log userspace base: fix compile warning

This fixes up a compile warning [-Wunused-but-set-variable] - given the
comment in userspace_set_region_sync() the non-reporting of errors is
intentional so the return value can be dropped to make gcc happy.

Also, fix typo in comment.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 39fa00733431..058256d2eeea 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -741,7 +741,6 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
 static void userspace_set_region_sync(struct dm_dirty_log *log,
 				      region_t region, int in_sync)
 {
-	int r;
 	struct log_c *lc = log->context;
 	struct {
 		region_t r;
@@ -751,12 +750,12 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
 	pkg.r = region;
 	pkg.i = (int64_t)in_sync;
 
-	r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
-				 (char *)&pkg, sizeof(pkg), NULL, NULL);
+	(void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
+				    (char *)&pkg, sizeof(pkg), NULL, NULL);
 
 	/*
 	 * It would be nice to be able to report failures.
-	 * However, it is easy emough to detect and resolve.
+	 * However, it is easy enough to detect and resolve.
 	 */
 	return;
 }

From aca607ba242dde316794c0b607048fd5b688a520 Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Tue, 17 Mar 2015 07:47:58 -0400
Subject: [PATCH 26/34] dm delay: use msecs_to_jiffies for time conversion

Converting milliseconds to jiffies by "val * HZ / 1000" is technically
OK but msecs_to_jiffies(val) is the cleaner solution and handles all
corner cases correctly.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-delay.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 42c3a27a14cc..57b6a1901c91 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -236,7 +236,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
 	delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
 
 	delayed->context = dc;
-	delayed->expires = expires = jiffies + (delay * HZ / 1000);
+	delayed->expires = expires = jiffies + msecs_to_jiffies(delay);
 
 	mutex_lock(&delayed_bios_lock);
 

From 0e0e32c16cfd2eeaf4fd4f16aa6cccd1333ce1e0 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 18 Mar 2015 20:57:29 -0400
Subject: [PATCH 27/34] dm thin: remove stale 'trim' message documentation

The 'trim' message wasn't ever implemented.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/thin-provisioning.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 2f5173500bd9..4f67578b2954 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -380,9 +380,6 @@ then you'll have no access to blocks mapped beyond the end.  If you
 load a target that is bigger than before, then extra blocks will be
 provisioned as and when needed.
 
-If you wish to reduce the size of your thin device and potentially
-regain some space then send the 'trim' message to the pool.
-
 ii) Status
 
      <nr mapped sectors> <highest mapped sector>

From 65ff5b7ddf0541f2b6e5cc59c47bfbf6cbcd91b8 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Wed, 18 Mar 2015 15:52:14 +0000
Subject: [PATCH 28/34] dm verity: add error handling modes for corrupted
 blocks

Add device specific modes to dm-verity to specify how corrupted
blocks should be handled.  The following modes are defined:

  - DM_VERITY_MODE_EIO is the default behavior, where reading a
    corrupted block results in -EIO.

  - DM_VERITY_MODE_LOGGING only logs corrupted blocks, but does
    not block the read.

  - DM_VERITY_MODE_RESTART calls kernel_restart when a corrupted
    block is discovered.

In addition, each mode sends a uevent to notify userspace of
corruption and to allow further recovery actions.

The driver defaults to previous behavior (DM_VERITY_MODE_EIO)
and other modes can be enabled with an additional parameter to
the verity table.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/verity.txt |  17 +++
 drivers/md/dm-verity.c                 | 147 +++++++++++++++++++++++--
 drivers/md/dm.c                        |   1 +
 3 files changed, 153 insertions(+), 12 deletions(-)

diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
index 9884681535ee..64ccc5a079a5 100644
--- a/Documentation/device-mapper/verity.txt
+++ b/Documentation/device-mapper/verity.txt
@@ -11,6 +11,7 @@ Construction Parameters
     <data_block_size> <hash_block_size>
     <num_data_blocks> <hash_start_block>
     <algorithm> <digest> <salt>
+    [<#opt_params> <opt_params>]
 
 <version>
     This is the type of the on-disk hash format.
@@ -62,6 +63,22 @@ Construction Parameters
 <salt>
     The hexadecimal encoding of the salt value.
 
+<#opt_params>
+    Number of optional parameters. If there are no optional parameters,
+    the optional paramaters section can be skipped or #opt_params can be zero.
+    Otherwise #opt_params is the number of following arguments.
+
+    Example of optional parameters section:
+        1 ignore_corruption
+
+ignore_corruption
+    Log corrupted blocks, but allow read operations to proceed normally.
+
+restart_on_corruption
+    Restart the system when a corrupted block is discovered. This option is
+    not compatible with ignore_corruption and requires user space support to
+    avoid restart loops.
+
 Theory of operation
 ===================
 
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 7a7bab8947ae..66616db33e6f 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -18,20 +18,39 @@
 
 #include <linux/module.h>
 #include <linux/device-mapper.h>
+#include <linux/reboot.h>
 #include <crypto/hash.h>
 
 #define DM_MSG_PREFIX			"verity"
 
+#define DM_VERITY_ENV_LENGTH		42
+#define DM_VERITY_ENV_VAR_NAME		"DM_VERITY_ERR_BLOCK_NR"
+
 #define DM_VERITY_IO_VEC_INLINE		16
 #define DM_VERITY_MEMPOOL_SIZE		4
 #define DM_VERITY_DEFAULT_PREFETCH_SIZE	262144
 
 #define DM_VERITY_MAX_LEVELS		63
+#define DM_VERITY_MAX_CORRUPTED_ERRS	100
+
+#define DM_VERITY_OPT_LOGGING		"ignore_corruption"
+#define DM_VERITY_OPT_RESTART		"restart_on_corruption"
 
 static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
 
 module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
 
+enum verity_mode {
+	DM_VERITY_MODE_EIO,
+	DM_VERITY_MODE_LOGGING,
+	DM_VERITY_MODE_RESTART
+};
+
+enum verity_block_type {
+	DM_VERITY_BLOCK_TYPE_DATA,
+	DM_VERITY_BLOCK_TYPE_METADATA
+};
+
 struct dm_verity {
 	struct dm_dev *data_dev;
 	struct dm_dev *hash_dev;
@@ -54,6 +73,8 @@ struct dm_verity {
 	unsigned digest_size;	/* digest size for the current hash algorithm */
 	unsigned shash_descsize;/* the size of temporary space for crypto */
 	int hash_failed;	/* set to 1 if hash of any block failed */
+	enum verity_mode mode;	/* mode for handling verification errors */
+	unsigned corrupted_errs;/* Number of errors for corrupted blocks */
 
 	mempool_t *vec_mempool;	/* mempool of bio vector */
 
@@ -174,6 +195,57 @@ static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
 		*offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
 }
 
+/*
+ * Handle verification errors.
+ */
+static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
+			     unsigned long long block)
+{
+	char verity_env[DM_VERITY_ENV_LENGTH];
+	char *envp[] = { verity_env, NULL };
+	const char *type_str = "";
+	struct mapped_device *md = dm_table_get_md(v->ti->table);
+
+	/* Corruption should be visible in device status in all modes */
+	v->hash_failed = 1;
+
+	if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS)
+		goto out;
+
+	v->corrupted_errs++;
+
+	switch (type) {
+	case DM_VERITY_BLOCK_TYPE_DATA:
+		type_str = "data";
+		break;
+	case DM_VERITY_BLOCK_TYPE_METADATA:
+		type_str = "metadata";
+		break;
+	default:
+		BUG();
+	}
+
+	DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
+		block);
+
+	if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
+		DMERR("%s: reached maximum errors", v->data_dev->name);
+
+	snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu",
+		DM_VERITY_ENV_VAR_NAME, type, block);
+
+	kobject_uevent_env(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE, envp);
+
+out:
+	if (v->mode == DM_VERITY_MODE_LOGGING)
+		return 0;
+
+	if (v->mode == DM_VERITY_MODE_RESTART)
+		kernel_restart("dm-verity device corrupted");
+
+	return 1;
+}
+
 /*
  * Verify hash of a metadata block pertaining to the specified data block
  * ("block" argument) at a specified level ("level" argument).
@@ -251,11 +323,11 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block,
 			goto release_ret_r;
 		}
 		if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
-			DMERR_LIMIT("metadata block %llu is corrupted",
-				(unsigned long long)hash_block);
-			v->hash_failed = 1;
-			r = -EIO;
-			goto release_ret_r;
+			if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA,
+					      hash_block)) {
+				r = -EIO;
+				goto release_ret_r;
+			}
 		} else
 			aux->hash_verified = 1;
 	}
@@ -367,10 +439,9 @@ test_block_hash:
 			return r;
 		}
 		if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
-			DMERR_LIMIT("data block %llu is corrupted",
-				(unsigned long long)(io->block + b));
-			v->hash_failed = 1;
-			return -EIO;
+			if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
+					      io->block + b))
+				return -EIO;
 		}
 	}
 
@@ -546,6 +617,19 @@ static void verity_status(struct dm_target *ti, status_type_t type,
 		else
 			for (x = 0; x < v->salt_size; x++)
 				DMEMIT("%02x", v->salt[x]);
+		if (v->mode != DM_VERITY_MODE_EIO) {
+			DMEMIT(" 1 ");
+			switch (v->mode) {
+			case DM_VERITY_MODE_LOGGING:
+				DMEMIT(DM_VERITY_OPT_LOGGING);
+				break;
+			case DM_VERITY_MODE_RESTART:
+				DMEMIT(DM_VERITY_OPT_RESTART);
+				break;
+			default:
+				BUG();
+			}
+		}
 		break;
 	}
 }
@@ -647,13 +731,19 @@ static void verity_dtr(struct dm_target *ti)
 static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
 	struct dm_verity *v;
-	unsigned num;
+	struct dm_arg_set as;
+	const char *opt_string;
+	unsigned int num, opt_params;
 	unsigned long long num_ll;
 	int r;
 	int i;
 	sector_t hash_position;
 	char dummy;
 
+	static struct dm_arg _args[] = {
+		{0, 1, "Invalid number of feature args"},
+	};
+
 	v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
 	if (!v) {
 		ti->error = "Cannot allocate verity structure";
@@ -668,8 +758,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
-	if (argc != 10) {
-		ti->error = "Invalid argument count: exactly 10 arguments required";
+	if (argc < 10) {
+		ti->error = "Not enough arguments";
 		r = -EINVAL;
 		goto bad;
 	}
@@ -790,6 +880,39 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		}
 	}
 
+	argv += 10;
+	argc -= 10;
+
+	/* Optional parameters */
+	if (argc) {
+		as.argc = argc;
+		as.argv = argv;
+
+		r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+		if (r)
+			goto bad;
+
+		while (opt_params) {
+			opt_params--;
+			opt_string = dm_shift_arg(&as);
+			if (!opt_string) {
+				ti->error = "Not enough feature arguments";
+				r = -EINVAL;
+				goto bad;
+			}
+
+			if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING))
+				v->mode = DM_VERITY_MODE_LOGGING;
+			else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART))
+				v->mode = DM_VERITY_MODE_RESTART;
+			else {
+				ti->error = "Invalid feature arguments";
+				r = -EINVAL;
+				goto bad;
+			}
+		}
+	}
+
 	v->hash_per_block_bits =
 		__fls((1 << v->hash_dev_block_bits) / v->digest_size);
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 944cdb322708..f8c7ca3e8947 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -3483,6 +3483,7 @@ struct gendisk *dm_disk(struct mapped_device *md)
 {
 	return md->disk;
 }
+EXPORT_SYMBOL_GPL(dm_disk);
 
 struct kobject *dm_kobject(struct mapped_device *md)
 {

From 7f61f5a022101e0c38c3cff2ef9ace9c9c86dbfb Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 30 Mar 2015 10:43:18 -0700
Subject: [PATCH 29/34] dm table: use bool function return values of true/false
 not 1/0

Use the normal return values for bool functions.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e0f618b43c25..d9b00b8565c6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1336,14 +1336,14 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
 			continue;
 
 		if (ti->flush_supported)
-			return 1;
+			return true;
 
 		if (ti->type->iterate_devices &&
 		    ti->type->iterate_devices(ti, device_flush_capable, &flush))
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
 static bool dm_table_discard_zeroes_data(struct dm_table *t)
@@ -1356,10 +1356,10 @@ static bool dm_table_discard_zeroes_data(struct dm_table *t)
 		ti = dm_table_get_target(t, i++);
 
 		if (ti->discard_zeroes_data_unsupported)
-			return 0;
+			return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
@@ -1405,10 +1405,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
 
 		if (!ti->type->iterate_devices ||
 		    !ti->type->iterate_devices(ti, func, NULL))
-			return 0;
+			return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
@@ -1465,14 +1465,14 @@ static bool dm_table_supports_discards(struct dm_table *t)
 			continue;
 
 		if (ti->discards_supported)
-			return 1;
+			return true;
 
 		if (ti->type->iterate_devices &&
 		    ti->type->iterate_devices(ti, device_discard_capable, NULL))
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,

From 0e9cebe724597a76ab1b0ebc0a21e16f7db11b47 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 20 Mar 2015 10:50:37 -0400
Subject: [PATCH 30/34] dm: add log writes target

Introduce a new target that is meant for file system developers to test file
system integrity at particular points in the life of a file system.  We capture
all write requests and associated data and log them to a separate device
for later replay.  There is a userspace utility to do this replay.  The
idea behind this is to give file system developers a tool to verify that
the file system is always consistent.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Zach Brown <zab@zabbo.net>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/log-writes.txt | 140 ++++
 drivers/md/Kconfig                         |  16 +
 drivers/md/Makefile                        |   1 +
 drivers/md/dm-log-writes.c                 | 825 +++++++++++++++++++++
 4 files changed, 982 insertions(+)
 create mode 100644 Documentation/device-mapper/log-writes.txt
 create mode 100644 drivers/md/dm-log-writes.c

diff --git a/Documentation/device-mapper/log-writes.txt b/Documentation/device-mapper/log-writes.txt
new file mode 100644
index 000000000000..c10f30c9b534
--- /dev/null
+++ b/Documentation/device-mapper/log-writes.txt
@@ -0,0 +1,140 @@
+dm-log-writes
+=============
+
+This target takes 2 devices, one to pass all IO to normally, and one to log all
+of the write operations to.  This is intended for file system developers wishing
+to verify the integrity of metadata or data as the file system is written to.
+There is a log_write_entry written for every WRITE request and the target is
+able to take arbitrary data from userspace to insert into the log.  The data
+that is in the WRITE requests is copied into the log to make the replay happen
+exactly as it happened originally.
+
+Log Ordering
+============
+
+We log things in order of completion once we are sure the write is no longer in
+cache.  This means that normal WRITE requests are not actually logged until the
+next REQ_FLUSH request.  This is to make it easier for userspace to replay the
+log in a way that correlates to what is on disk and not what is in cache, to
+make it easier to detect improper waiting/flushing.
+
+This works by attaching all WRITE requests to a list once the write completes.
+Once we see a REQ_FLUSH request we splice this list onto the request and once
+the FLUSH request completes we log all of the WRITEs and then the FLUSH.  Only
+completed WRITEs, at the time the REQ_FLUSH is issued, are added in order to
+simulate the worst case scenario with regard to power failures.  Consider the
+following example (W means write, C means complete):
+
+W1,W2,W3,C3,C2,Wflush,C1,Cflush
+
+The log would show the following
+
+W3,W2,flush,W1....
+
+Again this is to simulate what is actually on disk, this allows us to detect
+cases where a power failure at a particular point in time would create an
+inconsistent file system.
+
+Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as
+they complete as those requests will obviously bypass the device cache.
+
+Any REQ_DISCARD requests are treated like WRITE requests.  Otherwise we would
+have all the DISCARD requests, and then the WRITE requests and then the FLUSH
+request.  Consider the following example:
+
+WRITE block 1, DISCARD block 1, FLUSH
+
+If we logged DISCARD when it completed, the replay would look like this
+
+DISCARD 1, WRITE 1, FLUSH
+
+which isn't quite what happened and wouldn't be caught during the log replay.
+
+Target interface
+================
+
+i) Constructor
+
+   log-writes <dev_path> <log_dev_path>
+
+   dev_path	: Device that all of the IO will go to normally.
+   log_dev_path : Device where the log entries are written to.
+
+ii) Status
+
+    <#logged entries> <highest allocated sector>
+
+    #logged entries	       : Number of logged entries
+    highest allocated sector   : Highest allocated sector
+
+iii) Messages
+
+    mark <description>
+
+	You can use a dmsetup message to set an arbitrary mark in a log.
+	For example say you want to fsck a file system after every
+	write, but first you need to replay up to the mkfs to make sure
+	we're fsck'ing something reasonable, you would do something like
+	this:
+
+	  mkfs.btrfs -f /dev/mapper/log
+	  dmsetup message log 0 mark mkfs
+	  <run test>
+
+	  This would allow you to replay the log up to the mkfs mark and
+	  then replay from that point on doing the fsck check in the
+	  interval that you want.
+
+	Every log has a mark at the end labeled "dm-log-writes-end".
+
+Userspace component
+===================
+
+There is a userspace tool that will replay the log for you in various ways.
+It can be found here: https://github.com/josefbacik/log-writes
+
+Example usage
+=============
+
+Say you want to test fsync on your file system.  You would do something like
+this:
+
+TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
+dmsetup create log --table "$TABLE"
+mkfs.btrfs -f /dev/mapper/log
+dmsetup message log 0 mark mkfs
+
+mount /dev/mapper/log /mnt/btrfs-test
+<some test that does fsync at the end>
+dmsetup message log 0 mark fsync
+md5sum /mnt/btrfs-test/foo
+umount /mnt/btrfs-test
+
+dmsetup remove log
+replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync
+mount /dev/sdb /mnt/btrfs-test
+md5sum /mnt/btrfs-test/foo
+<verify md5sum's are correct>
+
+Another option is to do a complicated file system operation and verify the file
+system is consistent during the entire operation.  You could do this with:
+
+TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
+dmsetup create log --table "$TABLE"
+mkfs.btrfs -f /dev/mapper/log
+dmsetup message log 0 mark mkfs
+
+mount /dev/mapper/log /mnt/btrfs-test
+<fsstress to dirty the fs>
+btrfs filesystem balance /mnt/btrfs-test
+umount /mnt/btrfs-test
+dmsetup remove log
+
+replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs
+btrfsck /dev/sdb
+replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \
+	--fsck "btrfsck /dev/sdb" --check fua
+
+And that will replay the log until it sees a FUA request, run the fsck command
+and if the fsck passes it will replay to the next FUA, until it is completed or
+the fsck command exists abnormally.
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 109f9dcc9cab..6ddc983417d5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -443,4 +443,20 @@ config DM_SWITCH
 
 	  If unsure, say N.
 
+config DM_LOG_WRITES
+	tristate "Log writes target support"
+	depends on BLK_DEV_DM
+	---help---
+	  This device-mapper target takes two devices, one device to use
+	  normally, one to log all write operations done to the first device.
+	  This is for use by file system developers wishing to verify that
+	  their fs is writing a consitent file system at all times by allowing
+	  them to replay the log in a variety of ways and to check the
+	  contents.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-log-writes.
+
+	  If unsure, say N.
+
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a2da532b1c2b..1863feaa5846 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
 obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
 obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
+obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
new file mode 100644
index 000000000000..93e08446a87d
--- /dev/null
+++ b/drivers/md/dm-log-writes.c
@@ -0,0 +1,825 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#define DM_MSG_PREFIX "log-writes"
+
+/*
+ * This target will sequentially log all writes to the target device onto the
+ * log device.  This is helpful for replaying writes to check for fs consistency
+ * at all times.  This target provides a mechanism to mark specific events to
+ * check data at a later time.  So for example you would:
+ *
+ * write data
+ * fsync
+ * dmsetup message /dev/whatever mark mymark
+ * unmount /mnt/test
+ *
+ * Then replay the log up to mymark and check the contents of the replay to
+ * verify it matches what was written.
+ *
+ * We log writes only after they have been flushed, this makes the log describe
+ * close to the order in which the data hits the actual disk, not its cache.  So
+ * for example the following sequence (W means write, C means complete)
+ *
+ * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
+ *
+ * Would result in the log looking like this:
+ *
+ * c,a,flush,fuad,b,<other writes>,<next flush>
+ *
+ * This is meant to help expose problems where file systems do not properly wait
+ * on data being written before invoking a FLUSH.  FUA bypasses cache so once it
+ * completes it is added to the log as it should be on disk.
+ *
+ * We treat DISCARDs as if they don't bypass cache so that they are logged in
+ * order of completion along with the normal writes.  If we didn't do it this
+ * way we would process all the discards first and then write all the data, when
+ * in fact we want to do the data and the discard in the order that they
+ * completed.
+ */
+#define LOG_FLUSH_FLAG (1 << 0)
+#define LOG_FUA_FLAG (1 << 1)
+#define LOG_DISCARD_FLAG (1 << 2)
+#define LOG_MARK_FLAG (1 << 3)
+
+#define WRITE_LOG_VERSION 1
+#define WRITE_LOG_MAGIC 0x6a736677736872
+
+/*
+ * The disk format for this is braindead simple.
+ *
+ * At byte 0 we have our super, followed by the following sequence for
+ * nr_entries:
+ *
+ * [   1 sector    ][  entry->nr_sectors ]
+ * [log_write_entry][    data written    ]
+ *
+ * The log_write_entry takes up a full sector so we can have arbitrary length
+ * marks and it leaves us room for extra content in the future.
+ */
+
+/*
+ * Basic info about the log for userspace.
+ */
+struct log_write_super {
+	__le64 magic;
+	__le64 version;
+	__le64 nr_entries;
+	__le32 sectorsize;
+};
+
+/*
+ * sector - the sector we wrote.
+ * nr_sectors - the number of sectors we wrote.
+ * flags - flags for this log entry.
+ * data_len - the size of the data in this log entry, this is for private log
+ * entry stuff, the MARK data provided by userspace for example.
+ */
+struct log_write_entry {
+	__le64 sector;
+	__le64 nr_sectors;
+	__le64 flags;
+	__le64 data_len;
+};
+
+struct log_writes_c {
+	struct dm_dev *dev;
+	struct dm_dev *logdev;
+	u64 logged_entries;
+	u32 sectorsize;
+	atomic_t io_blocks;
+	atomic_t pending_blocks;
+	sector_t next_sector;
+	sector_t end_sector;
+	bool logging_enabled;
+	bool device_supports_discard;
+	spinlock_t blocks_lock;
+	struct list_head unflushed_blocks;
+	struct list_head logging_blocks;
+	wait_queue_head_t wait;
+	struct task_struct *log_kthread;
+};
+
+struct pending_block {
+	int vec_cnt;
+	u64 flags;
+	sector_t sector;
+	sector_t nr_sectors;
+	char *data;
+	u32 datalen;
+	struct list_head list;
+	struct bio_vec vecs[0];
+};
+
+struct per_bio_data {
+	struct pending_block *block;
+};
+
+static void put_pending_block(struct log_writes_c *lc)
+{
+	if (atomic_dec_and_test(&lc->pending_blocks)) {
+		smp_mb__after_atomic();
+		if (waitqueue_active(&lc->wait))
+			wake_up(&lc->wait);
+	}
+}
+
+static void put_io_block(struct log_writes_c *lc)
+{
+	if (atomic_dec_and_test(&lc->io_blocks)) {
+		smp_mb__after_atomic();
+		if (waitqueue_active(&lc->wait))
+			wake_up(&lc->wait);
+	}
+}
+
+static void log_end_io(struct bio *bio, int err)
+{
+	struct log_writes_c *lc = bio->bi_private;
+	struct bio_vec *bvec;
+	int i;
+
+	if (err) {
+		unsigned long flags;
+
+		DMERR("Error writing log block, error=%d", err);
+		spin_lock_irqsave(&lc->blocks_lock, flags);
+		lc->logging_enabled = false;
+		spin_unlock_irqrestore(&lc->blocks_lock, flags);
+	}
+
+	bio_for_each_segment_all(bvec, bio, i)
+		__free_page(bvec->bv_page);
+
+	put_io_block(lc);
+	bio_put(bio);
+}
+
+/*
+ * Meant to be called if there is an error, it will free all the pages
+ * associated with the block.
+ */
+static void free_pending_block(struct log_writes_c *lc,
+			       struct pending_block *block)
+{
+	int i;
+
+	for (i = 0; i < block->vec_cnt; i++) {
+		if (block->vecs[i].bv_page)
+			__free_page(block->vecs[i].bv_page);
+	}
+	kfree(block->data);
+	kfree(block);
+	put_pending_block(lc);
+}
+
+static int write_metadata(struct log_writes_c *lc, void *entry,
+			  size_t entrylen, void *data, size_t datalen,
+			  sector_t sector)
+{
+	struct bio *bio;
+	struct page *page;
+	void *ptr;
+	size_t ret;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (!bio) {
+		DMERR("Couldn't alloc log bio");
+		goto error;
+	}
+	bio->bi_iter.bi_size = 0;
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_bdev = lc->logdev->bdev;
+	bio->bi_end_io = log_end_io;
+	bio->bi_private = lc;
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		DMERR("Couldn't alloc log page");
+		bio_put(bio);
+		goto error;
+	}
+
+	ptr = kmap_atomic(page);
+	memcpy(ptr, entry, entrylen);
+	if (datalen)
+		memcpy(ptr + entrylen, data, datalen);
+	memset(ptr + entrylen + datalen, 0,
+	       lc->sectorsize - entrylen - datalen);
+	kunmap_atomic(ptr);
+
+	ret = bio_add_page(bio, page, lc->sectorsize, 0);
+	if (ret != lc->sectorsize) {
+		DMERR("Couldn't add page to the log block");
+		goto error_bio;
+	}
+	submit_bio(WRITE, bio);
+	return 0;
+error_bio:
+	bio_put(bio);
+	__free_page(page);
+error:
+	put_io_block(lc);
+	return -1;
+}
+
+static int log_one_block(struct log_writes_c *lc,
+			 struct pending_block *block, sector_t sector)
+{
+	struct bio *bio;
+	struct log_write_entry entry;
+	size_t ret;
+	int i;
+
+	entry.sector = cpu_to_le64(block->sector);
+	entry.nr_sectors = cpu_to_le64(block->nr_sectors);
+	entry.flags = cpu_to_le64(block->flags);
+	entry.data_len = cpu_to_le64(block->datalen);
+	if (write_metadata(lc, &entry, sizeof(entry), block->data,
+			   block->datalen, sector)) {
+		free_pending_block(lc, block);
+		return -1;
+	}
+
+	if (!block->vec_cnt)
+		goto out;
+	sector++;
+
+	bio = bio_alloc(GFP_KERNEL, block->vec_cnt);
+	if (!bio) {
+		DMERR("Couldn't alloc log bio");
+		goto error;
+	}
+	atomic_inc(&lc->io_blocks);
+	bio->bi_iter.bi_size = 0;
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_bdev = lc->logdev->bdev;
+	bio->bi_end_io = log_end_io;
+	bio->bi_private = lc;
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	for (i = 0; i < block->vec_cnt; i++) {
+		/*
+		 * The page offset is always 0 because we allocate a new page
+		 * for every bvec in the original bio for simplicity sake.
+		 */
+		ret = bio_add_page(bio, block->vecs[i].bv_page,
+				   block->vecs[i].bv_len, 0);
+		if (ret != block->vecs[i].bv_len) {
+			atomic_inc(&lc->io_blocks);
+			submit_bio(WRITE, bio);
+			bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i);
+			if (!bio) {
+				DMERR("Couldn't alloc log bio");
+				goto error;
+			}
+			bio->bi_iter.bi_size = 0;
+			bio->bi_iter.bi_sector = sector;
+			bio->bi_bdev = lc->logdev->bdev;
+			bio->bi_end_io = log_end_io;
+			bio->bi_private = lc;
+			set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+			ret = bio_add_page(bio, block->vecs[i].bv_page,
+					   block->vecs[i].bv_len, 0);
+			if (ret != block->vecs[i].bv_len) {
+				DMERR("Couldn't add page on new bio?");
+				bio_put(bio);
+				goto error;
+			}
+		}
+		sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
+	}
+	submit_bio(WRITE, bio);
+out:
+	kfree(block->data);
+	kfree(block);
+	put_pending_block(lc);
+	return 0;
+error:
+	free_pending_block(lc, block);
+	put_io_block(lc);
+	return -1;
+}
+
+static int log_super(struct log_writes_c *lc)
+{
+	struct log_write_super super;
+
+	super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
+	super.version = cpu_to_le64(WRITE_LOG_VERSION);
+	super.nr_entries = cpu_to_le64(lc->logged_entries);
+	super.sectorsize = cpu_to_le32(lc->sectorsize);
+
+	if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
+		DMERR("Couldn't write super");
+		return -1;
+	}
+
+	return 0;
+}
+
+static inline sector_t logdev_last_sector(struct log_writes_c *lc)
+{
+	return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static int log_writes_kthread(void *arg)
+{
+	struct log_writes_c *lc = (struct log_writes_c *)arg;
+	sector_t sector = 0;
+
+	while (!kthread_should_stop()) {
+		bool super = false;
+		bool logging_enabled;
+		struct pending_block *block = NULL;
+		int ret;
+
+		spin_lock_irq(&lc->blocks_lock);
+		if (!list_empty(&lc->logging_blocks)) {
+			block = list_first_entry(&lc->logging_blocks,
+						 struct pending_block, list);
+			list_del_init(&block->list);
+			if (!lc->logging_enabled)
+				goto next;
+
+			sector = lc->next_sector;
+			if (block->flags & LOG_DISCARD_FLAG)
+				lc->next_sector++;
+			else
+				lc->next_sector += block->nr_sectors + 1;
+
+			/*
+			 * Apparently the size of the device may not be known
+			 * right away, so handle this properly.
+			 */
+			if (!lc->end_sector)
+				lc->end_sector = logdev_last_sector(lc);
+			if (lc->end_sector &&
+			    lc->next_sector >= lc->end_sector) {
+				DMERR("Ran out of space on the logdev");
+				lc->logging_enabled = false;
+				goto next;
+			}
+			lc->logged_entries++;
+			atomic_inc(&lc->io_blocks);
+
+			super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
+			if (super)
+				atomic_inc(&lc->io_blocks);
+		}
+next:
+		logging_enabled = lc->logging_enabled;
+		spin_unlock_irq(&lc->blocks_lock);
+		if (block) {
+			if (logging_enabled) {
+				ret = log_one_block(lc, block, sector);
+				if (!ret && super)
+					ret = log_super(lc);
+				if (ret) {
+					spin_lock_irq(&lc->blocks_lock);
+					lc->logging_enabled = false;
+					spin_unlock_irq(&lc->blocks_lock);
+				}
+			} else
+				free_pending_block(lc, block);
+			continue;
+		}
+
+		if (!try_to_freeze()) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (!kthread_should_stop() &&
+			    !atomic_read(&lc->pending_blocks))
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Construct a log-writes mapping:
+ * log-writes <dev_path> <log_dev_path>
+ */
+static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct log_writes_c *lc;
+	struct dm_arg_set as;
+	const char *devname, *logdevname;
+
+	as.argc = argc;
+	as.argv = argv;
+
+	if (argc < 2) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
+	if (!lc) {
+		ti->error = "Cannot allocate context";
+		return -ENOMEM;
+	}
+	spin_lock_init(&lc->blocks_lock);
+	INIT_LIST_HEAD(&lc->unflushed_blocks);
+	INIT_LIST_HEAD(&lc->logging_blocks);
+	init_waitqueue_head(&lc->wait);
+	lc->sectorsize = 1 << SECTOR_SHIFT;
+	atomic_set(&lc->io_blocks, 0);
+	atomic_set(&lc->pending_blocks, 0);
+
+	devname = dm_shift_arg(&as);
+	if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) {
+		ti->error = "Device lookup failed";
+		goto bad;
+	}
+
+	logdevname = dm_shift_arg(&as);
+	if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) {
+		ti->error = "Log device lookup failed";
+		dm_put_device(ti, lc->dev);
+		goto bad;
+	}
+
+	lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
+	if (!lc->log_kthread) {
+		ti->error = "Couldn't alloc kthread";
+		dm_put_device(ti, lc->dev);
+		dm_put_device(ti, lc->logdev);
+		goto bad;
+	}
+
+	/* We put the super at sector 0, start logging at sector 1 */
+	lc->next_sector = 1;
+	lc->logging_enabled = true;
+	lc->end_sector = logdev_last_sector(lc);
+	lc->device_supports_discard = true;
+
+	ti->num_flush_bios = 1;
+	ti->flush_supported = true;
+	ti->num_discard_bios = 1;
+	ti->discards_supported = true;
+	ti->per_bio_data_size = sizeof(struct per_bio_data);
+	ti->private = lc;
+	return 0;
+
+bad:
+	kfree(lc);
+	return -EINVAL;
+}
+
+static int log_mark(struct log_writes_c *lc, char *data)
+{
+	struct pending_block *block;
+	size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);
+
+	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
+	if (!block) {
+		DMERR("Error allocating pending block");
+		return -ENOMEM;
+	}
+
+	block->data = kstrndup(data, maxsize, GFP_KERNEL);
+	if (!block->data) {
+		DMERR("Error copying mark data");
+		kfree(block);
+		return -ENOMEM;
+	}
+	atomic_inc(&lc->pending_blocks);
+	block->datalen = strlen(block->data);
+	block->flags |= LOG_MARK_FLAG;
+	spin_lock_irq(&lc->blocks_lock);
+	list_add_tail(&block->list, &lc->logging_blocks);
+	spin_unlock_irq(&lc->blocks_lock);
+	wake_up_process(lc->log_kthread);
+	return 0;
+}
+
+static void log_writes_dtr(struct dm_target *ti)
+{
+	struct log_writes_c *lc = ti->private;
+
+	spin_lock_irq(&lc->blocks_lock);
+	list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
+	spin_unlock_irq(&lc->blocks_lock);
+
+	/*
+	 * This is just nice to have since it'll update the super to include the
+	 * unflushed blocks, if it fails we don't really care.
+	 */
+	log_mark(lc, "dm-log-writes-end");
+	wake_up_process(lc->log_kthread);
+	wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
+		   !atomic_read(&lc->pending_blocks));
+	kthread_stop(lc->log_kthread);
+
+	WARN_ON(!list_empty(&lc->logging_blocks));
+	WARN_ON(!list_empty(&lc->unflushed_blocks));
+	dm_put_device(ti, lc->dev);
+	dm_put_device(ti, lc->logdev);
+	kfree(lc);
+}
+
+static void normal_map_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct log_writes_c *lc = ti->private;
+
+	bio->bi_bdev = lc->dev->bdev;
+}
+
+static int log_writes_map(struct dm_target *ti, struct bio *bio)
+{
+	struct log_writes_c *lc = ti->private;
+	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+	struct pending_block *block;
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	size_t alloc_size;
+	int i = 0;
+	bool flush_bio = (bio->bi_rw & REQ_FLUSH);
+	bool fua_bio = (bio->bi_rw & REQ_FUA);
+	bool discard_bio = (bio->bi_rw & REQ_DISCARD);
+
+	pb->block = NULL;
+
+	/* Don't bother doing anything if logging has been disabled */
+	if (!lc->logging_enabled)
+		goto map_bio;
+
+	/*
+	 * Map reads as normal.
+	 */
+	if (bio_data_dir(bio) == READ)
+		goto map_bio;
+
+	/* No sectors and not a flush?  Don't care */
+	if (!bio_sectors(bio) && !flush_bio)
+		goto map_bio;
+
+	/*
+	 * Discards will have bi_size set but there's no actual data, so just
+	 * allocate the size of the pending block.
+	 */
+	if (discard_bio)
+		alloc_size = sizeof(struct pending_block);
+	else
+		alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);
+
+	block = kzalloc(alloc_size, GFP_NOIO);
+	if (!block) {
+		DMERR("Error allocating pending block");
+		spin_lock_irq(&lc->blocks_lock);
+		lc->logging_enabled = false;
+		spin_unlock_irq(&lc->blocks_lock);
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&block->list);
+	pb->block = block;
+	atomic_inc(&lc->pending_blocks);
+
+	if (flush_bio)
+		block->flags |= LOG_FLUSH_FLAG;
+	if (fua_bio)
+		block->flags |= LOG_FUA_FLAG;
+	if (discard_bio)
+		block->flags |= LOG_DISCARD_FLAG;
+
+	block->sector = bio->bi_iter.bi_sector;
+	block->nr_sectors = bio_sectors(bio);
+
+	/* We don't need the data, just submit */
+	if (discard_bio) {
+		WARN_ON(flush_bio || fua_bio);
+		if (lc->device_supports_discard)
+			goto map_bio;
+		bio_endio(bio, 0);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	/* Flush bio, splice the unflushed blocks onto this list and submit */
+	if (flush_bio && !bio_sectors(bio)) {
+		spin_lock_irq(&lc->blocks_lock);
+		list_splice_init(&lc->unflushed_blocks, &block->list);
+		spin_unlock_irq(&lc->blocks_lock);
+		goto map_bio;
+	}
+
+	/*
+	 * We will write this bio somewhere else way later so we need to copy
+	 * the actual contents into new pages so we know the data will always be
+	 * there.
+	 *
+	 * We do this because this could be a bio from O_DIRECT in which case we
+	 * can't just hold onto the page until some later point, we have to
+	 * manually copy the contents.
+	 */
+	bio_for_each_segment(bv, bio, iter) {
+		struct page *page;
+		void *src, *dst;
+
+		page = alloc_page(GFP_NOIO);
+		if (!page) {
+			DMERR("Error allocing page");
+			free_pending_block(lc, block);
+			spin_lock_irq(&lc->blocks_lock);
+			lc->logging_enabled = false;
+			spin_unlock_irq(&lc->blocks_lock);
+			return -ENOMEM;
+		}
+
+		src = kmap_atomic(bv.bv_page);
+		dst = kmap_atomic(page);
+		memcpy(dst, src + bv.bv_offset, bv.bv_len);
+		kunmap_atomic(dst);
+		kunmap_atomic(src);
+		block->vecs[i].bv_page = page;
+		block->vecs[i].bv_len = bv.bv_len;
+		block->vec_cnt++;
+		i++;
+	}
+
+	/* Had a flush with data in it, weird */
+	if (flush_bio) {
+		spin_lock_irq(&lc->blocks_lock);
+		list_splice_init(&lc->unflushed_blocks, &block->list);
+		spin_unlock_irq(&lc->blocks_lock);
+	}
+map_bio:
+	normal_map_bio(ti, bio);
+	return DM_MAPIO_REMAPPED;
+}
+
+static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+	struct log_writes_c *lc = ti->private;
+	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+
+	if (bio_data_dir(bio) == WRITE && pb->block) {
+		struct pending_block *block = pb->block;
+		unsigned long flags;
+
+		spin_lock_irqsave(&lc->blocks_lock, flags);
+		if (block->flags & LOG_FLUSH_FLAG) {
+			list_splice_tail_init(&block->list, &lc->logging_blocks);
+			list_add_tail(&block->list, &lc->logging_blocks);
+			wake_up_process(lc->log_kthread);
+		} else if (block->flags & LOG_FUA_FLAG) {
+			list_add_tail(&block->list, &lc->logging_blocks);
+			wake_up_process(lc->log_kthread);
+		} else
+			list_add_tail(&block->list, &lc->unflushed_blocks);
+		spin_unlock_irqrestore(&lc->blocks_lock, flags);
+	}
+
+	return error;
+}
+
+/*
+ * INFO format: <logged entries> <highest allocated sector>
+ */
+static void log_writes_status(struct dm_target *ti, status_type_t type,
+			      unsigned status_flags, char *result,
+			      unsigned maxlen)
+{
+	unsigned sz = 0;
+	struct log_writes_c *lc = ti->private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("%llu %llu", lc->logged_entries,
+		       (unsigned long long)lc->next_sector - 1);
+		if (!lc->logging_enabled)
+			DMEMIT(" logging_disabled");
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
+		break;
+	}
+}
+
+static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
+			    unsigned long arg)
+{
+	struct log_writes_c *lc = ti->private;
+	struct dm_dev *dev = lc->dev;
+	int r = 0;
+
+	/*
+	 * Only pass ioctls through if the device sizes match exactly.
+	 */
+	if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+		r = scsi_verify_blk_ioctl(NULL, cmd);
+
+	return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
+}
+
+static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+			    struct bio_vec *biovec, int max_size)
+{
+	struct log_writes_c *lc = ti->private;
+	struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = lc->dev->bdev;
+	bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int log_writes_iterate_devices(struct dm_target *ti,
+				      iterate_devices_callout_fn fn,
+				      void *data)
+{
+	struct log_writes_c *lc = ti->private;
+
+	return fn(ti, lc->dev, 0, ti->len, data);
+}
+
+/*
+ * Messages supported:
+ *   mark <mark data> - specify the marked data.
+ */
+static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r = -EINVAL;
+	struct log_writes_c *lc = ti->private;
+
+	if (argc != 2) {
+		DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
+		return r;
+	}
+
+	if (!strcasecmp(argv[0], "mark"))
+		r = log_mark(lc, argv[1]);
+	else
+		DMWARN("Unrecognised log writes target message received: %s", argv[0]);
+
+	return r;
+}
+
+static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct log_writes_c *lc = ti->private;
+	struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+	if (!q || !blk_queue_discard(q)) {
+		lc->device_supports_discard = false;
+		limits->discard_granularity = 1 << SECTOR_SHIFT;
+		limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
+	}
+}
+
+static struct target_type log_writes_target = {
+	.name   = "log-writes",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr    = log_writes_ctr,
+	.dtr    = log_writes_dtr,
+	.map    = log_writes_map,
+	.end_io = normal_end_io,
+	.status = log_writes_status,
+	.ioctl	= log_writes_ioctl,
+	.merge	= log_writes_merge,
+	.message = log_writes_message,
+	.iterate_devices = log_writes_iterate_devices,
+	.io_hints = log_writes_io_hints,
+};
+
+static int __init dm_log_writes_init(void)
+{
+	int r = dm_register_target(&log_writes_target);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	return r;
+}
+
+static void __exit dm_log_writes_exit(void)
+{
+	dm_unregister_target(&log_writes_target);
+}
+
+module_init(dm_log_writes_init);
+module_exit(dm_log_writes_exit);
+
+MODULE_DESCRIPTION(DM_NAME " log writes target");
+MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
+MODULE_LICENSE("GPL");

From e44f23b32dc7916b2bc12817e2f723fefa21ba41 Mon Sep 17 00:00:00 2001
From: Milan Broz <gmazyland@gmail.com>
Date: Sun, 5 Apr 2015 18:03:10 +0200
Subject: [PATCH 31/34] dm crypt: update URLs to new cryptsetup project page

Cryptsetup home page moved to GitLab.
Also remove link to abandonded Truecrypt page.

Signed-off-by: Milan Broz <gmazyland@gmail.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/dm-crypt.txt | 4 ++--
 Documentation/device-mapper/verity.txt   | 4 ++--
 drivers/md/dm-crypt.c                    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index ad697781f9ac..692171fe9da0 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -5,7 +5,7 @@ Device-Mapper's "crypt" target provides transparent encryption of block devices
 using the kernel crypto API.
 
 For a more detailed description of supported parameters see:
-http://code.google.com/p/cryptsetup/wiki/DMCrypt
+https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt
 
 Parameters: <cipher> <key> <iv_offset> <device path> \
 	      <offset> [<#opt_params> <opt_params>]
@@ -80,7 +80,7 @@ Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
 encryption with dm-crypt using the 'cryptsetup' utility, see
-http://code.google.com/p/cryptsetup/
+https://gitlab.com/cryptsetup/cryptsetup
 
 [[
 #!/bin/sh
diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
index 64ccc5a079a5..e15bc1a0fb98 100644
--- a/Documentation/device-mapper/verity.txt
+++ b/Documentation/device-mapper/verity.txt
@@ -142,7 +142,7 @@ block boundary) are the hash blocks which are stored a depth at a time
 
 The full specification of kernel parameters and on-disk metadata format
 is available at the cryptsetup project's wiki page
-  http://code.google.com/p/cryptsetup/wiki/DMVerity
+  https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity
 
 Status
 ======
@@ -159,7 +159,7 @@ Set up a device:
 
 A command line tool veritysetup is available to compute or verify
 the hash tree or activate the kernel device. This is available from
-the cryptsetup upstream repository http://code.google.com/p/cryptsetup/
+the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/
 (as a libcryptsetup extension).
 
 Create hash on the device:
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 713a96237a80..ea09d5464a9f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -228,7 +228,7 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
  *
  * tcw:  Compatible implementation of the block chaining mode used
  *       by the TrueCrypt device encryption system (prior to version 4.1).
- *       For more info see: http://www.truecrypt.org
+ *       For more info see: https://gitlab.com/cryptsetup/cryptsetup/wikis/TrueCryptOnDiskFormat
  *       It operates on full 512 byte sectors and uses CBC
  *       with an IV derived from initial key and the sector number.
  *       In addition, whitening value is applied on every sector, whitening

From 5977907937afa2b5584a874d44ba6c0f56aeaa9c Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 9 Apr 2015 16:53:24 -0400
Subject: [PATCH 32/34] dm crypt: leverage immutable biovecs when decrypting on
 read

Commit 003b5c571 ("block: Convert drivers to immutable biovecs")
stopped short of changing dm-crypt to leverage the fact that the biovec
array of a bio will no longer be modified.

Switch to using bio_clone_fast() when cloning bios for decryption after
read.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ea09d5464a9f..aa1238facbeb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1124,15 +1124,15 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
 	struct crypt_config *cc = io->cc;
-	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
 
 	/*
-	 * The block layer might modify the bvec array, so always
-	 * copy the required bvecs because we need the original
-	 * one in order to decrypt the whole bio data *afterwards*.
+	 * We need the original biovec array in order to decrypt
+	 * the whole bio data *afterwards* -- thanks to immutable
+	 * biovecs we don't need to worry about the block layer
+	 * modifying the biovec array; so leverage bio_clone_fast().
 	 */
-	clone = bio_clone_bioset(base_bio, gfp, cc->bs);
+	clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
 	if (!clone)
 		return 1;
 

From 0618764cb25f6fa9fb31152995de42a8a0496475 Mon Sep 17 00:00:00 2001
From: Ben Collins <ben.c@servergy.com>
Date: Fri, 3 Apr 2015 16:09:46 +0000
Subject: [PATCH 33/34] dm crypt: fix deadlock when async crypto algorithm
 returns -EBUSY

I suspect this doesn't show up for most anyone because software
algorithms typically don't have a sense of being too busy.  However,
when working with the Freescale CAAM driver it will return -EBUSY on
occasion under heavy -- which resulted in dm-crypt deadlock.

After checking the logic in some other drivers, the scheme for
crypt_convert() and it's callback, kcryptd_async_done(), were not
correctly laid out to properly handle -EBUSY or -EINPROGRESS.

Fix this by using the completion for both -EBUSY and -EINPROGRESS.  Now
crypt_convert()'s use of completion is comparable to
af_alg_wait_for_completion().  Similarly, kcryptd_async_done() follows
the pattern used in af_alg_complete().

Before this fix dm-crypt would lockup within 1-2 minutes running with
the CAAM driver.  Fix was regression tested against software algorithms
on PPC32 and x86_64, and things seem perfectly happy there as well.

Signed-off-by: Ben Collins <ben.c@servergy.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/dm-crypt.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index aa1238facbeb..9b5e1eb0ffcf 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -925,11 +925,10 @@ static int crypt_convert(struct crypt_config *cc,
 
 		switch (r) {
 		/* async */
+		case -EINPROGRESS:
 		case -EBUSY:
 			wait_for_completion(&ctx->restart);
 			reinit_completion(&ctx->restart);
-			/* fall through*/
-		case -EINPROGRESS:
 			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
@@ -1346,10 +1345,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
 	struct crypt_config *cc = io->cc;
 
-	if (error == -EINPROGRESS) {
-		complete(&ctx->restart);
+	if (error == -EINPROGRESS)
 		return;
-	}
 
 	if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
 		error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
@@ -1360,12 +1357,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
 	if (!atomic_dec_and_test(&ctx->cc_pending))
-		return;
+		goto done;
 
 	if (bio_data_dir(io->base_bio) == READ)
 		kcryptd_crypt_read_done(io);
 	else
 		kcryptd_crypt_write_io_submit(io, 1);
+done:
+	if (!completion_done(&ctx->restart))
+		complete(&ctx->restart);
 }
 
 static void kcryptd_crypt(struct work_struct *work)

From 44c144f9c8e8fbd73ede2848da8253b3aae42ec2 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Thu, 16 Apr 2015 22:00:50 -0400
Subject: [PATCH 34/34] dm crypt: fix missing error code return from crypt_ctr
 error path

Fix to return a negative error code from crypt_ctr()'s optional
parameter processing error path.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9b5e1eb0ffcf..9eeea196328a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1816,6 +1816,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		if (ret)
 			goto bad;
 
+		ret = -EINVAL;
 		while (opt_params--) {
 			opt_string = dm_shift_arg(&as);
 			if (!opt_string) {