From 1af2048a3e87b4e982c53ad8cfb0c75d1a9c0a73 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:48 +0100 Subject: [PATCH 01/69] dm raid: fix deadlock caused by premature md_stop_writes() md_stop_writes() is called in raid_presuspend() causing deadlocks on bios submitted afterwards -- which happens on loaded raid sets with conversion requests. Fix by moving md_stop_writes() to raid_postsuspend(). NOTE: when the recovery's frozen (MD_RECOVERY_FROZEN), writes haven't been started (or are already stopped) so don't stop them again. Also remove superfluous readonly setting. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 6319d846e0ad..398314b6c31a 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3613,24 +3613,19 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs)); } -static void raid_presuspend(struct dm_target *ti) -{ - struct raid_set *rs = ti->private; - - md_stop_writes(&rs->md); -} - static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + /* Writes have to be stopped before suspending to avoid deadlocks. */ + if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) + md_stop_writes(&rs->md); + mddev_lock_nointr(&rs->md); mddev_suspend(&rs->md); mddev_unlock(&rs->md); } - - rs->md.ro = 1; } static void attempt_restore_of_faulty_devices(struct raid_set *rs) @@ -3903,7 +3898,6 @@ static struct target_type raid_target = { .message = raid_message, .iterate_devices = raid_iterate_devices, .io_hints = raid_io_hints, - .presuspend = raid_presuspend, .postsuspend = raid_postsuspend, .preresume = raid_preresume, .resume = raid_resume, From 052b2b1e0689b30af2608d908916a16e9dbd0919 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:49 +0100 Subject: [PATCH 02/69] dm raid: consume sizes after md_finish_reshape() completes changing them The md raid personalities call md_finish_reshape() at the end of a reshape conversion which adjusts rdev->sectors. Correct/check rdev->sectors before initiating a reshape and raise the recovery pointer accordingly. Otherwise, the DM raid coordinated reshape will fail. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 398314b6c31a..c3ea4337bf51 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2640,12 +2640,19 @@ static int rs_adjust_data_offsets(struct raid_set *rs) * Make sure we got a minimum amount of free sectors per device */ if (rs->data_offset && - to_sector(i_size_read(rdev->bdev->bd_inode)) - rdev->sectors < MIN_FREE_RESHAPE_SPACE) { + to_sector(i_size_read(rdev->bdev->bd_inode)) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) { rs->ti->error = data_offset ? "No space for forward reshape" : "No space for backward reshape"; return -ENOSPC; } out: + /* + * Raise recovery_cp in case data_offset != 0 to + * avoid false recovery positives in the constructor. 
+ */ + if (rs->md.recovery_cp < rs->md.dev_sectors) + rs->md.recovery_cp += rs->dev[0].rdev.data_offset; + /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { if (!test_bit(Journal, &rdev->flags)) { @@ -2777,6 +2784,23 @@ static int rs_prepare_reshape(struct raid_set *rs) return 0; } +/* Get reshape sectors from data_offsets or raid set */ +static sector_t _get_reshape_sectors(struct raid_set *rs) +{ + struct md_rdev *rdev; + sector_t reshape_sectors = 0; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) { + reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ? + rdev->data_offset - rdev->new_data_offset : + rdev->new_data_offset - rdev->data_offset; + break; + } + + return max(reshape_sectors, (sector_t) rs->data_offset); +} + /* * * - change raid layout @@ -2788,6 +2812,7 @@ static int rs_setup_reshape(struct raid_set *rs) { int r = 0; unsigned int cur_raid_devs, d; + sector_t reshape_sectors = _get_reshape_sectors(rs); struct mddev *mddev = &rs->md; struct md_rdev *rdev; @@ -2804,13 +2829,13 @@ static int rs_setup_reshape(struct raid_set *rs) /* * Adjust array size: * - * - in case of adding disks, array size has + * - in case of adding disk(s), array size has * to grow after the disk adding reshape, * which'll hapen in the event handler; * reshape will happen forward, so space has to * be available at the beginning of each disk * - * - in case of removing disks, array size + * - in case of removing disk(s), array size * has to shrink before starting the reshape, * which'll happen here; * reshape will happen backward, so space has to @@ -2841,7 +2866,7 @@ static int rs_setup_reshape(struct raid_set *rs) rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector; } - mddev->reshape_backwards = 0; /* adding disks -> forward reshape */ + mddev->reshape_backwards = 0; /* adding disk(s) -> forward reshape */ /* Remove disk(s) */ } else if (rs->delta_disks < 0) { @@ -2874,6 +2899,15 @@ static int rs_setup_reshape(struct raid_set *rs) mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1; } + /* + * Adjust device size for forward reshape + * because md_finish_reshape() reduces it. + */ + if (!mddev->reshape_backwards) + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors += reshape_sectors; + return r; } From 7501537ee3a5e6bd01c0084af141e4fa84e652c0 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:50 +0100 Subject: [PATCH 03/69] dm raid: correct resizing state relative to reshape space in ctr Pay attention to existing reshape space to define if a raid set needs resizing. Otherwise we can hit "Can't resize a reshaping raid set" when a reshape is being requested. 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c3ea4337bf51..c4b0cb181fbc 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2969,10 +2969,10 @@ static void configure_discard_support(struct raid_set *rs) static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int r; - bool resize; + bool resize = false; struct raid_type *rt; unsigned int num_raid_params, num_raid_devs; - sector_t calculated_dev_sectors, rdev_sectors; + sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors; struct raid_set *rs = NULL; const char *arg; struct rs_layout rs_layout; @@ -3055,7 +3055,10 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - resize = calculated_dev_sectors != rdev_sectors; + + reshape_sectors = _get_reshape_sectors(rs); + if (calculated_dev_sectors != rdev_sectors) + resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors); INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -3178,7 +3181,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) mddev_lock_nointr(&rs->md); r = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ - if (r) { ti->error = "Failed to run raid array"; mddev_unlock(&rs->md); From 61e06e2c3ebd986050958513bfa40dceed756f8f Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:51 +0100 Subject: [PATCH 04/69] dm raid: fix raid set size revalidation The raid set size is being revalidated unconditionally before a reshaping conversion is started. MD requires the size to only be reduced in case of a stripe removing (i.e. shrinking) reshape but not when growing because the raid array has to stay small until after the growing reshape finishes. Fix by avoiding the size revalidation in preresume unless a shrinking reshape is requested. 
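The resulting rule can be sketched as follows (hypothetical helper for illustration; the patch below folds this logic into raid_preresume()/raid_resume()):

/*
 * Sketch only: MD demands that a shrinking (disk removing) reshape
 * reduces the raid set size up front, while a growing reshape must
 * keep the old size until md_finish_reshape() has completed.
 */
static void rs_adjust_size_for_reshape(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;

	if (mddev->delta_disks < 0)
		rs_set_capacity(rs);	/* shrink before the reshape starts */

	/*
	 * delta_disks >= 0: leave the size alone; the event handler
	 * grows it once the reshape has finished.
	 */
}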
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c4b0cb181fbc..ff75324133fb 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -675,15 +675,11 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout) return NULL; } -/* - * Conditionally change bdev capacity of @rs - * in case of a disk add/remove reshape - */ -static void rs_set_capacity(struct raid_set *rs) +/* Adjust rdev sectors */ +static void rs_set_rdev_sectors(struct raid_set *rs) { struct mddev *mddev = &rs->md; struct md_rdev *rdev; - struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); /* * raid10 sets rdev->sector to the device size, which * is unintended in case of out-of-place reshaping */ rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags)) rdev->sectors = mddev->dev_sectors; +} - set_capacity(gendisk, mddev->array_sectors); +/* + * Change bdev capacity of @rs in case of a disk add/remove reshape + */ +static void rs_set_capacity(struct raid_set *rs) +{ + struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); + + set_capacity(gendisk, rs->md.array_sectors); revalidate_disk(gendisk); } @@ -1674,8 +1678,11 @@ static void do_table_event(struct work_struct *ws) struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); smp_rmb(); /* Make sure we access most actual mddev properties */ - if (!rs_is_reshaping(rs)) + if (!rs_is_reshaping(rs)) { + if (rs_is_raid10(rs)) + rs_set_rdev_sectors(rs); rs_set_capacity(rs); + } dm_table_event(rs->ti->table); } @@ -3873,11 +3880,10 @@ static int raid_preresume(struct dm_target *ti) mddev->resync_min = mddev->recovery_cp; } - rs_set_capacity(rs); - /* Check for any reshape request unless new raid set */ if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { /* Initiate a reshape. */ + rs_set_rdev_sectors(rs); mddev_lock_nointr(mddev); r = rs_start_reshape(rs); mddev_unlock(mddev); @@ -3906,6 +3912,10 @@ static void raid_resume(struct dm_target *ti) mddev->ro = 0; mddev->in_sync = 0; + /* Only reduce raid set size before running a disk removing reshape. */ + if (mddev->delta_disks < 0) + rs_set_capacity(rs); + /* * Keep the RAID set frozen if reshape/rebuild flags are set. * The RAID set is unfrozen once the next table load/resume, From 188a212df1f3a2d7ea9bb0fc0ab4173042c23470 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:59 +0100 Subject: [PATCH 05/69] dm raid: add component device size checks to avoid runtime failure Check all component data device sizes versus the calculated size. Reject if device(s) are too small. Otherwise, MD will fail the operation by accessing beyond the end of the data device. An example use case is that a growing bitmap won't fit any more and the MD runtime will report an error, whereas DM raid should catch this earlier. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index ff75324133fb..2bb0ac7c3fba 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1580,6 +1580,24 @@ static sector_t __rdev_sectors(struct raid_set *rs) return 0; } +/* Check that calculated dev_sectors fits all component devices.
*/ +static int _check_data_dev_sectors(struct raid_set *rs) +{ + sector_t ds = ~0; + struct md_rdev *rdev; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags) && rdev->bdev) { + ds = min(ds, to_sector(i_size_read(rdev->bdev->bd_inode))); + if (ds < rs->md.dev_sectors) { + rs->ti->error = "Component device(s) too small"; + return -EINVAL; + } + } + + return 0; +} + /* Calculate the sectors per device and per array used for @rs */ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) { @@ -1629,7 +1647,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; - return 0; + return _check_data_dev_sectors(rs); bad: rs->ti->error = "Target length not divisible by number of data devices"; return -EINVAL; From d39f0010e40964d959c5157be02839da8a178015 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:52 +0100 Subject: [PATCH 06/69] dm raid: fix raid_resume() to keep raid set frozen as needed During a reshape request: if userspace reloads a "raid" table multiple times, resulting in multiple superblock reads, the raid set needs to stay frozen until all config changes (chunk size, layout data_offset, delta_disks) have been stored in the superblocks and respective flags cleared. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2bb0ac7c3fba..bf3c9e3c736d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3899,7 +3899,7 @@ static int raid_preresume(struct dm_target *ti) } /* Check for any reshape request unless new raid set */ - if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { + if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { /* Initiate a reshape. */ rs_set_rdev_sectors(rs); mddev_lock_nointr(mddev); @@ -3941,8 +3941,14 @@ static void raid_resume(struct dm_target *ti) * This ensures that the constructor for the inactive table * retrieves an up-to-date reshape_position. */ - if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (!test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags) && + !(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) { + if (rs_is_reshapable(rs)) { + if (!rs_is_reshaping(rs) || _get_reshape_sectors(rs)) + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + } else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + } if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { mddev_lock_nointr(mddev); From 67143510a7e3634a23f06a48445d1148b2fdbc4d Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:53 +0100 Subject: [PATCH 07/69] dm raid: display a consistent copy of the MD status via raid_status() The MD sync thread updates recovery flags providing state of any running, idle, frozen, recovering, reshaping, ... activity it performs and updates respective flags asynchronously versus dm processing raid_status(). To close that race window, take a single copy of the flags and pass it into its callees. 
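The pattern, in sketch form (a single snapshot of mddev->recovery is taken and only the local copy is tested from then on):

/* Sketch of the snapshot pattern the patch below applies. */
static void raid_status_snapshot_example(struct raid_set *rs)
{
	unsigned long recovery;

	smp_rmb();			/* access most recent mddev properties */
	recovery = rs->md.recovery;	/* one racy read, then a stable copy */

	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
		return;			/* every test sees the same snapshot */
}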
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index bf3c9e3c736d..3df7c5bd5a9b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3300,25 +3300,25 @@ static int raid_map(struct dm_target *ti, struct bio *bio) } /* Return string describing the current sync action of @mddev */ -static const char *decipher_sync_action(struct mddev *mddev) +static const char *decipher_sync_action(struct mddev *mddev, unsigned long recovery) { - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) return "frozen"; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return "reshape"; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) return "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_CHECK, &recovery)) return "check"; return "repair"; } - if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) return "recover"; } @@ -3350,7 +3350,7 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, } /* Helper to return resync/reshape progress for @rs and @array_in_sync */ -static sector_t rs_get_progress(struct raid_set *rs, +static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, sector_t resync_max_sectors, bool *array_in_sync) { sector_t r, curr_resync_completed; @@ -3367,7 +3367,7 @@ static sector_t rs_get_progress(struct raid_set *rs, r = mddev->reshape_position; /* Reshape is relative to the array size */ - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || + if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || r != MaxSector) { if (r == MaxSector) { *array_in_sync = true; @@ -3382,20 +3382,20 @@ static sector_t rs_get_progress(struct raid_set *rs, } /* Sync is relative to the component device size */ - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RUNNING, &recovery)) r = curr_resync_completed; else r = mddev->recovery_cp; if ((r == MaxSector) || - (test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + (test_bit(MD_RECOVERY_DONE, &recovery) && (mddev->curr_resync_completed == resync_max_sectors))) { /* * Sync complete. */ *array_in_sync = true; r = resync_max_sectors; - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* * If "check" or "repair" is occurring, the raid set has * undergone an initial sync and the health characters @@ -3438,6 +3438,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, struct r5conf *conf = mddev->private; int i, max_nr_stripes = conf ? 
conf->max_nr_stripes : 0; bool array_in_sync; + unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ unsigned int sz = 0; unsigned int rebuild_disks; @@ -3457,13 +3458,14 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Access most recent mddev properties for status output */ smp_rmb(); + recovery = rs->md.recovery; /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync); + progress = rs_get_progress(rs, recovery, resync_max_sectors, &array_in_sync); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; - sync_action = decipher_sync_action(&rs->md); + sync_action = decipher_sync_action(&rs->md, recovery); /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) From 242ea5ad11a03f2fbdfc2fe422d8e1b0601a8073 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:54 +0100 Subject: [PATCH 08/69] dm raid: avoid passing array_in_sync variable to raid_status() callees The raid_status() function passes the bool array_in_sync variable around providing synchronization state of the MD array. Replace it with a runtime flag. This will avoid a pattern of having to pass discrete variables to various functions. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 3df7c5bd5a9b..5730b32034aa 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -209,6 +209,7 @@ struct raid_dev { #define RT_FLAG_UPDATE_SBS 3 #define RT_FLAG_RESHAPE_RS 4 #define RT_FLAG_RS_SUSPENDED 5 +#define RT_FLAG_RS_IN_SYNC 6 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -3335,7 +3336,7 @@ static const char *decipher_sync_action(struct mddev *mddev, unsigned long recov * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) */ -static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync) +static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) { if (!rdev->bdev) return "-"; @@ -3343,25 +3344,27 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, return "D"; else if (test_bit(Journal, &rdev->flags)) return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? 
"A" : "a"; - else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) + else if (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && + !test_bit(In_sync, &rdev->flags)) return "a"; else return "A"; } -/* Helper to return resync/reshape progress for @rs and @array_in_sync */ +/* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, - sector_t resync_max_sectors, bool *array_in_sync) + sector_t resync_max_sectors) { sector_t r, curr_resync_completed; struct mddev *mddev = &rs->md; + clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; - *array_in_sync = false; if (rs_is_raid0(rs)) { r = resync_max_sectors; - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { r = mddev->reshape_position; @@ -3370,7 +3373,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || r != MaxSector) { if (r == MaxSector) { - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); r = resync_max_sectors; } else { /* Got to reverse on backward reshape */ @@ -3393,7 +3396,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, /* * Sync complete. */ - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); r = resync_max_sectors; } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* @@ -3401,7 +3404,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, * undergone an initial sync and the health characters * should not be 'a' anymore. */ - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { struct md_rdev *rdev; @@ -3414,7 +3417,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); #if 0 r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ #endif @@ -3437,7 +3440,6 @@ static void raid_status(struct dm_target *ti, status_type_t type, struct mddev *mddev = &rs->md; struct r5conf *conf = mddev->private; int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; - bool array_in_sync; unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ unsigned int sz = 0; @@ -3462,14 +3464,14 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, recovery, resync_max_sectors, &array_in_sync); + progress = rs_get_progress(rs, recovery, resync_max_sectors); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; sync_action = decipher_sync_action(&rs->md, recovery); /* HM FIXME: do we want another state char for raid0? 
It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) - DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync)); + DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev)); /* * In-sync/Reshape ratio: @@ -3520,7 +3522,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * v1.10.0+: */ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? - __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-"); + __raid_dev_status(rs, &rs->journal_dev.rdev) : "-"); break; case STATUSTYPE_TABLE: From 4102d9de6d375fc27ec70382c4068f4f9f62ce4f Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:55 +0100 Subject: [PATCH 09/69] dm raid: fix rs_get_progress() synchronization state/ratio Fix various sync state issues causing racy/bogus sync ratio, sync_action and health chars in dm_status() info output. Sync ratio could be N/N (i.e. 100%) shortly after raid set creation, i.e. creating a new RaidLV or upconverting a linear LV to raid1 thus: "0 2097152 raid raid1 2 Aa 2097162/2097152 recover 0 0 -" instead of: "0 2097152 raid raid1 2 Aa 0/2097152 idle 0 0 -" Sync action could be non-idle when the MD thread was already done with io. Health chars could be 'A' when they should be 'a' for a short time before a resynchronization started. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 95 +++++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 31 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5730b32034aa..7e7075fb9c28 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -210,6 +210,7 @@ struct raid_dev { #define RT_FLAG_RESHAPE_RS 4 #define RT_FLAG_RS_SUSPENDED 5 #define RT_FLAG_RS_IN_SYNC 6 +#define RT_FLAG_RS_RESYNCING 7 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -3306,8 +3307,10 @@ static const char *decipher_sync_action(struct mddev *mddev, unsigned long recov if (test_bit(MD_RECOVERY_FROZEN, &recovery)) return "frozen"; - if (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + /* The MD sync thread can be done with io but still be running */ + if (!test_bit(MD_RECOVERY_DONE, &recovery) && + (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return "reshape"; @@ -3344,8 +3347,9 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) return "D"; else if (test_bit(Journal, &rdev->flags)) return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ?
"A" : "a"; - else if (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && - !test_bit(In_sync, &rdev->flags)) + else if (test_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags) || + (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && + !test_bit(In_sync, &rdev->flags))) return "a"; else return "A"; @@ -3355,49 +3359,70 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, sector_t resync_max_sectors) { - sector_t r, curr_resync_completed; + sector_t r; struct mddev *mddev = &rs->md; clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - - curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; + clear_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); if (rs_is_raid0(rs)) { r = resync_max_sectors; set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - r = mddev->reshape_position; - /* Reshape is relative to the array size */ - if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || - r != MaxSector) { - if (r == MaxSector) { - set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - r = resync_max_sectors; - } else { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) { + r = mddev->reshape_position; + if (r != MaxSector) { /* Got to reverse on backward reshape */ if (mddev->reshape_backwards) r = mddev->array_sectors - r; - /* Devide by # of data stripes */ - sector_div(r, mddev_data_stripes(rs)); + /* Divide by # of data stripes unless raid1 */ + if (!rs_is_raid1(rs)) + sector_div(r, mddev_data_stripes(rs)); } - /* Sync is relative to the component device size */ - } else if (test_bit(MD_RECOVERY_RUNNING, &recovery)) - r = curr_resync_completed; + /* + * Sync/recover is relative to the component device size. + * + * MD_RECOVERY_NEEDED for https://bugzilla.redhat.com/show_bug.cgi?id=1508070 + */ + } else if (test_bit(MD_RECOVERY_NEEDED, &recovery) || + test_bit(MD_RECOVERY_RUNNING, &recovery)) + r = mddev->curr_resync_completed; + else r = mddev->recovery_cp; - if ((r == MaxSector) || - (test_bit(MD_RECOVERY_DONE, &recovery) && - (mddev->curr_resync_completed == resync_max_sectors))) { + if (r >= resync_max_sectors && + (!test_bit(MD_RECOVERY_REQUESTED, &recovery) || + (!test_bit(MD_RECOVERY_FROZEN, &recovery) && + !test_bit(MD_RECOVERY_NEEDED, &recovery) && + !test_bit(MD_RECOVERY_RUNNING, &recovery)))) { /* * Sync complete. */ - set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - r = resync_max_sectors; + /* In case we have finished recovering, the array is in sync. */ + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + + } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) { + /* + * In case we are recovering, the array is not in sync + * and health chars should show the recovering legs. + */ + ; + + } else if (test_bit(MD_RECOVERY_SYNC, &recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &recovery)) { + /* + * If "resync" is occurring, the raid set + * is or may be out of sync hence the health + * characters shall be 'a'. + */ + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* * If "check" or "repair" is occurring, the raid set has @@ -3405,26 +3430,34 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, * should not be 'a' anymore. */ set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + } else { struct md_rdev *rdev; + /* + * We are idle and recovery is needed, prevent 'A' chars race + * caused by components still set to in-sync by constrcuctor. 
+ */ + if (test_bit(MD_RECOVERY_NEEDED, &recovery)) + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + /* * The raid set may be doing an initial sync, or it may * be rebuilding individual components. If all the * devices are In_sync, then it is the raid set that is * being initialized. */ + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && - !test_bit(In_sync, &rdev->flags)) - set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); -#if 0 - r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ -#endif + !test_bit(In_sync, &rdev->flags)) { + clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + break; + } } } - return r; + return min(r, resync_max_sectors); } /* Helper to return @dev name or "-" if !@dev */ From 78a75d10ef869f4fae70f9b86afce28eb1922529 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:56 +0100 Subject: [PATCH 10/69] dm raid: small cleanup and remove unsed "struct raid_set" member Move raid_resume()'s setting of 'rw' and 'in_sync' to just prior to mddev_resume(). Also, remove unused 'bitmap_loaded' member from "struct raid_set". No functional changes. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 7e7075fb9c28..1069e617e727 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -227,7 +227,6 @@ struct rs_layout { struct raid_set { struct dm_target *ti; - uint32_t bitmap_loaded; uint32_t stripe_cache_entries; unsigned long ctr_flags; unsigned long runtime_flags; @@ -3964,9 +3963,6 @@ static void raid_resume(struct dm_target *ti) attempt_restore_of_faulty_devices(rs); } - mddev->ro = 0; - mddev->in_sync = 0; - /* Only reduce raid set size before running a disk removing reshape. */ if (mddev->delta_disks < 0) rs_set_capacity(rs); @@ -3989,6 +3985,8 @@ static void raid_resume(struct dm_target *ti) if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { mddev_lock_nointr(mddev); + mddev->ro = 0; + mddev->in_sync = 0; mddev_resume(mddev); mddev_unlock(mddev); } From b84cf26924cfe405993fc45fa2911cde38f3c3ac Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 4 Dec 2017 10:26:21 -0500 Subject: [PATCH 11/69] dm raid: bump target version to reflect numerous fixes Also update Documentation accordingly. Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-raid.txt | 4 +++- drivers/md/dm-raid.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 32df07e29f68..7b22375091fa 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -343,5 +343,7 @@ Version History 1.11.0 Fix table line argument order (wrong raid10_copies/raid10_format sequence) 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option -1.12.1 fix for MD deadlock between mddev_suspend() and md_write_start() available +1.12.1 Fix for MD deadlock between mddev_suspend() and md_write_start() available 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') +1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an + state races. 
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1069e617e727..764baa9665bb 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3994,7 +3994,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 13, 0}, + .version = {1, 13, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, From 53bf5384f9b9e37c628f171366959a38c89779ca Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:17 +0100 Subject: [PATCH 12/69] dm raid: validate current raid set's redundancy Verifying the current raid set's redundancy based on retrieved superblock content has to use the superblock's raid level (e.g. raid0), not the constructor-requested one (e.g. raid10). Using the requested raid level of raid10 led to a "divide error" on raid0, which defines the number of data copies (used as a divisor) to be zero. Also check for bogus data copies. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 764baa9665bb..b82b7095a671 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1007,7 +1007,7 @@ static int validate_raid_redundancy(struct raid_set *rs) !rs->dev[i].rdev.sb_page) rebuild_cnt++; - switch (rs->raid_type->level) { + switch (rs->md.level) { case 0: break; case 1: @@ -1022,6 +1022,11 @@ break; case 10: copies = raid10_md_layout_to_copies(rs->md.new_layout); + if (copies < 2) { + DMERR("Bogus raid10 data copies < 2!"); + return -EINVAL; + } + if (rebuild_cnt < copies) break; From 11e4723206683ad59f8e9dc7771e7b44a37f7b62 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:18 +0100 Subject: [PATCH 13/69] dm raid: stop keeping raid set frozen altogether In order to avoid redoing synchronization/recovery/reshape partially, the raid set got frozen until after all passed-in table line flags had been cleared. The related table reload sequence had to be followed precisely, or reshaping might lead to data corruption caused by the active mapping carrying on with a reshape while the inactive mapping had already retrieved a stale reshape position. Harden by retrieving the actual resync/recovery/reshape position during resume, whilst the active table is suspended, thus avoiding the need to keep the raid set frozen altogether. This prevents superfluously redoing an already resynchronized or recovered segment and, most importantly, redoing an already reshaped segment, which would cause data corruption. Fixes: d39f0010e ("dm raid: fix raid_resume() to keep raid set frozen as needed") Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-raid.txt | 1 + drivers/md/dm-raid.c | 108 +++++++++++++++--------- 2 files changed, 71 insertions(+), 38 deletions(-) diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 7b22375091fa..390c145f01d7 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -347,3 +347,4 @@ Version History 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') 1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an state races.
+1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b82b7095a671..109b001407a8 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -29,6 +29,9 @@ */ #define MIN_RAID456_JOURNAL_SPACE (4*2048) +/* Global list of all raid sets */ +LIST_HEAD(raid_sets); + static bool devices_handle_discard_safely = false; /* @@ -105,8 +108,6 @@ struct raid_dev { #define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) #define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE) -#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) - /* * Definitions of various constructor flags to * be used in checks of valid / invalid flags @@ -226,6 +227,7 @@ struct rs_layout { struct raid_set { struct dm_target *ti; + struct list_head list; uint32_t stripe_cache_entries; unsigned long ctr_flags; @@ -271,6 +273,19 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l) mddev->new_chunk_sectors = l->new_chunk_sectors; } +/* Find any raid_set in active slot for @rs on global list */ +static struct raid_set *rs_find_active(struct raid_set *rs) +{ + struct raid_set *r; + struct mapped_device *md = dm_table_get_md(rs->ti->table); + + list_for_each_entry(r, &raid_sets, list) + if (r != rs && dm_table_get_md(r->ti->table) == md) + return r; + + return NULL; +} + /* raid10 algorithms (i.e. formats) */ #define ALGORITHM_RAID10_DEFAULT 0 #define ALGORITHM_RAID10_NEAR 1 @@ -749,6 +764,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r mddev_init(&rs->md); + INIT_LIST_HEAD(&rs->list); rs->raid_disks = raid_devs; rs->delta_disks = 0; @@ -766,6 +782,9 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r for (i = 0; i < raid_devs; i++) md_rdev_init(&rs->dev[i].rdev); + /* Add @rs to global list. */ + list_add(&rs->list, &raid_sets); + /* * Remaining items to be initialized by further RAID params: * rs->md.persistent @@ -778,6 +797,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r return rs; } +/* Free all @rs allocations and remove it from global list. */ static void raid_set_free(struct raid_set *rs) { int i; @@ -795,6 +815,8 @@ static void raid_set_free(struct raid_set *rs) dm_put_device(rs->ti, rs->dev[i].data_dev); } + list_del(&rs->list); + kfree(rs); } @@ -2371,7 +2393,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) DMERR("new device%s provided without 'rebuild'", new_devs > 1 ? "s" : ""); return -EINVAL; - } else if (rs_is_recovering(rs)) { + } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", (unsigned long long) mddev->recovery_cp); return -EINVAL; @@ -3173,19 +3195,22 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - /* - * We can only prepare for a reshape here, because the - * raid set needs to run to provide the repective reshape - * check functions via its MD personality instance. - * - * So do the reshape check after md_run() succeeded. - */ - r = rs_prepare_reshape(rs); - if (r) - return r; + /* Out-of-place space has to be available to allow for a reshape unless raid1! */ + if (reshape_sectors || rs_is_raid1(rs)) { + /* + * We can only prepare for a reshape here, because the + * raid set needs to run to provide the repective reshape + * check functions via its MD personality instance. 
+ * + * So do the reshape check after md_run() succeeded. + */ + r = rs_prepare_reshape(rs); + if (r) + return r; - /* Reshaping ain't recovery, so disable recovery */ - rs_setup_recovery(rs, MaxSector); + /* Reshaping ain't recovery, so disable recovery */ + rs_setup_recovery(rs, MaxSector); + } rs_set_cur(rs); } else { /* May not set recovery when a device rebuild is requested */ @@ -3395,7 +3420,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, } else if (test_bit(MD_RECOVERY_NEEDED, &recovery) || test_bit(MD_RECOVERY_RUNNING, &recovery)) r = mddev->curr_resync_completed; - else r = mddev->recovery_cp; @@ -3904,10 +3928,33 @@ static int raid_preresume(struct dm_target *ti) struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; - /* This is a resume after a suspend of the set -> it's already started */ + /* This is a resume after a suspend of the set -> it's already started. */ if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags)) return 0; + if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { + struct raid_set *rs_active = rs_find_active(rs); + + if (rs_active) { + /* + * In case no rebuilds have been requested + * and an active table slot exists, copy + * current resynchonization completed and + * reshape position pointers across from + * suspended raid set in the active slot. + * + * This resumes the new mapping at current + * offsets to continue recover/reshape without + * necessarily redoing a raid set partially or + * causing data corruption in case of a reshape. + */ + if (rs_active->md.curr_resync_completed != MaxSector) + mddev->curr_resync_completed = rs_active->md.curr_resync_completed; + if (rs_active->md.reshape_position != MaxSector) + mddev->reshape_position = rs_active->md.reshape_position; + } + } + /* * The superblocks need to be updated on disk if the * array is new or new devices got added (thus zeroed @@ -3968,28 +4015,13 @@ static void raid_resume(struct dm_target *ti) attempt_restore_of_faulty_devices(rs); } - /* Only reduce raid set size before running a disk removing reshape. */ - if (mddev->delta_disks < 0) - rs_set_capacity(rs); - - /* - * Keep the RAID set frozen if reshape/rebuild flags are set. - * The RAID set is unfrozen once the next table load/resume, - * which clears the reshape/rebuild flags, occurs. - * This ensures that the constructor for the inactive table - * retrieves an up-to-date reshape_position. - */ - if (!test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags) && - !(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) { - if (rs_is_reshapable(rs)) { - if (!rs_is_reshaping(rs) || _get_reshape_sectors(rs)) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } - if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + /* Only reduce raid set size before running a disk removing reshape. 
*/ + if (mddev->delta_disks < 0) + rs_set_capacity(rs); + mddev_lock_nointr(mddev); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); mddev->ro = 0; mddev->in_sync = 0; mddev_resume(mddev); @@ -3999,7 +4031,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 13, 1}, + .version = {1, 13, 2}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, From dc15b943d4651bc13b9737bb27283ad9d3b8eeba Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:19 +0100 Subject: [PATCH 14/69] dm raid: ensure 'a' chars during reshape During reshape, 'A' chars were reported in status rather than 'a'. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 109b001407a8..af4f40de2c0b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3451,6 +3451,15 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, */ set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + } else if (test_bit(MD_RECOVERY_RESHAPE, &recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &recovery)) { + /* + * If "reshape" is occurring, the raid set + * is or may be out of sync hence the health + * characters shall be 'a'. + */ + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* * If "check" or "repair" is occurring, the raid set has From 7c29744eccecc2c74c9b4d1ea0a60b4d95229399 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:20 +0100 Subject: [PATCH 15/69] dm raid: simplify rs_get_progress() No need to calculate the reshaping progress because mddev->curr_resync_completed holds it. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index af4f40de2c0b..21e007c89c2e 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3399,26 +3399,9 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - /* Reshape is relative to the array size */ - if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) { - r = mddev->reshape_position; - if (r != MaxSector) { - /* Got to reverse on backward reshape */ - if (mddev->reshape_backwards) - r = mddev->array_sectors - r; - - /* Divide by # of data stripes unless raid1 */ - if (!rs_is_raid1(rs)) - sector_div(r, mddev_data_stripes(rs)); - } - - /* - * Sync/recover is relative to the component device size. - * - * MD_RECOVERY_NEEDED for https://bugzilla.redhat.com/show_bug.cgi?id=1508070 - */ - } else if (test_bit(MD_RECOVERY_NEEDED, &recovery) || - test_bit(MD_RECOVERY_RUNNING, &recovery)) + if (test_bit(MD_RECOVERY_NEEDED, &recovery) || + test_bit(MD_RECOVERY_RESHAPE, &recovery) || + test_bit(MD_RECOVERY_RUNNING, &recovery)) r = mddev->curr_resync_completed; else r = mddev->recovery_cp; From 552aa679f265743163fb440c61370a9c51f66c81 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:21 +0100 Subject: [PATCH 16/69] dm raid: use rs_is_raid*() Cleanup, no functional change. 
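For reference, the rs_is_raid*() predicates are presumably simple level checks along these lines (illustrative definitions, not part of this patch):

/* Illustrative only: the dm-raid.c helpers this cleanup switches to. */
static bool rs_is_raid0(struct raid_set *rs)   { return !rs->md.level; }
static bool rs_is_raid1(struct raid_set *rs)   { return rs->md.level == 1; }
static bool rs_is_raid10(struct raid_set *rs)  { return rs->md.level == 10; }
static bool rs_is_raid456(struct raid_set *rs) { return rs->md.level >= 4 && rs->md.level <= 6; }

Testing the raid_set instead of raw mddev->level keeps all level checks behind one interface, which is the point of the cleanup.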
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 21e007c89c2e..7d7dc1723180 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -588,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout) } /* Return md raid10 algorithm for @name */ -static int raid10_name_to_format(const char *name) +static const int raid10_name_to_format(const char *name) { if (!strcasecmp(name, "near")) return ALGORITHM_RAID10_NEAR; @@ -1913,7 +1913,7 @@ static bool rs_reshape_requested(struct raid_set *rs) if (rs_takeover_requested(rs)) return false; - if (!mddev->level) + if (rs_is_raid0(rs)) return false; change = mddev->new_layout != mddev->layout || @@ -1921,7 +1921,7 @@ static bool rs_reshape_requested(struct raid_set *rs) rs->delta_disks; /* Historical case to support raid1 reshape without delta disks */ - if (mddev->level == 1) { + if (rs_is_raid1(rs)) { if (rs->delta_disks) return !!rs->delta_disks; @@ -1929,7 +1929,7 @@ static bool rs_reshape_requested(struct raid_set *rs) mddev->raid_disks != rs->raid_disks; } - if (mddev->level == 10) + if (rs_is_raid10(rs)) return change && !__is_raid10_far(mddev->new_layout) && rs->delta_disks >= 0; @@ -2742,14 +2742,14 @@ static int rs_setup_takeover(struct raid_set *rs) sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 0 : rs->data_offset; if (rt_is_raid10(rs->raid_type)) { - if (mddev->level == 0) { + if (rs_is_raid0(rs)) { /* Userpace reordered disks -> adjust raid_disk indexes */ __reorder_raid_disk_indexes(rs); /* raid0 -> raid10_far layout */ mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR, rs->raid10_copies); - } else if (mddev->level == 1) + } else if (rs_is_raid1(rs)) /* raid1 -> raid10_near layout */ mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR, rs->raid_disks); @@ -2977,7 +2977,7 @@ static void configure_discard_support(struct raid_set *rs) /* * XXX: RAID level 4,5,6 require zeroing for safety. */ - raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6); + raid456 = rs_is_raid456(rs); for (i = 0; i < rs->raid_disks; i++) { struct request_queue *q; @@ -3002,7 +3002,7 @@ static void configure_discard_support(struct raid_set *rs) * RAID1 and RAID10 personalities require bio splitting, * RAID0/4/5/6 don't and process large discard bios properly. */ - ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10); + ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs)); ti->num_discard_bios = 1; } From c06b3e583750fa8f1d214ca50c86d936f6a329c6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Nov 2017 08:44:35 -0500 Subject: [PATCH 17/69] dm: fix comment above dm_accept_partial_bio Clarify that dm_accept_partial_bio isn't allowed for REQ_OP_ZONE_RESET bios. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index de17b7193299..fb9e6d808170 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -997,7 +997,7 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, /* * A target may call dm_accept_partial_bio only from the map routine. It is - * allowed for all bio types except REQ_PREFLUSH. + * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET. 
* * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be From 80cd17578310dbaf880ae0db9240ad2218c5811a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 30 Aug 2017 08:10:18 +1000 Subject: [PATCH 18/69] dm crypt: remove BIOSET_NEED_RESCUER flag The BIOSET_NEED_RESCUER flag is only needed when a make_request_fn might do two allocations from the one bioset, and the second one could block until the first bio completes. dm-crypt does allocate from this bioset inside the dm make_request_fn, but does so using GFP_NOWAIT so that the allocation will not block. So BIOSET_NEED_RESCUER is not needed. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9fc12f556534..9c53367d2f3e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2697,8 +2697,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - cc->bs = bioset_create(MIN_IOS, 0, (BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER)); + cc->bs = bioset_create(MIN_IOS, 0, BIOSET_NEED_BVECS); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; goto bad; From c110a4b6e603ece6134fe436e84957f7a4cd099e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 30 Aug 2017 08:10:18 +1000 Subject: [PATCH 19/69] dm io: remove BIOSET_NEED_RESCUER flag from bios bioset The BIOSET_NEED_RESCUER flag is only needed when a make_request_fn might do two allocations from the one bioset, and the second one could block until the first bio completes. dm_io() is called from make_request_fn() context. The closest it comes to multiple allocations is in chunk_io() in dm-snap-persistent. But there the code uses a separate thread to avoid problems. So BIOSET_NEED_RESCUER is not needed. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index b4357ed4d541..a8d914d5abbe 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -58,8 +58,7 @@ struct dm_io_client *dm_io_client_create(void) if (!client->pool) goto bad; - client->bios = bioset_create(min_ios, 0, (BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER)); + client->bios = bioset_create(min_ios, 0, BIOSET_NEED_BVECS); if (!client->bios) goto bad; From 18a25da84354c6bb655320de6072c00eda6eb602 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 6 Sep 2017 09:43:28 +1000 Subject: [PATCH 20/69] dm: ensure bio submission follows a depth-first tree walk A dm device can, in general, represent a tree of targets, each of which handles a sub-range of the range of blocks handled by the parent. The bio sequencing managed by generic_make_request() requires that bios are generated and handled in a depth-first manner. Each call to a make_request_fn() may submit bios to a single member device, and may submit bios for a reduced region of the same device as the make_request_fn. In particular, any bios submitted to member devices must be expected to be processed in order, so a later one must never wait for an earlier one. This ordering is usually achieved by using bio_split() to reduce a bio to a size that can be completely handled by one target, and resubmitting the remainder to the originating device. bio_queue_split() shows the canonical approach. 
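That canonical pattern looks roughly like this (a sketch of the bio_queue_split()-style flow, not dm code; q->bio_split is the per-queue bioset):

/* Sketch: depth-first handling via split + resubmit of the remainder. */
if (bio_sectors(bio) > max_sectors) {
	struct bio *split = bio_split(bio, max_sectors, GFP_NOIO,
				      q->bio_split);

	bio_chain(split, bio);		/* 'split' completes into 'bio' */
	generic_make_request(bio);	/* queue the remainder for later */
	bio = split;			/* process only the front part now */
}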
dm doesn't follow this approach, largely because it has needed to split bios since long before bio_split() was available. It currently can submit bios to separate targets within the one dm_make_request() call. Dependencies between these targets, as can happen with dm-snap, can cause deadlocks if either bio gets stuck behind the other in the queues managed by generic_make_request(). This requires the 'rescue' functionality provided by dm_offload_{start,end}. Some of this requirement can be removed by changing the order of bio submission to follow the canonical approach. That is, if dm finds that it needs to split a bio, the remainder should be sent to generic_make_request() rather than being handled immediately. This delays the handling until the first part is completely processed, so the deadlock problems do not occur. __split_and_process_bio() can be called both from dm_make_request() and from dm_wq_work(). When called from dm_wq_work() the current approach is perfectly satisfactory as each bio will be processed immediately. When called from dm_make_request(), current->bio_list will be non-NULL, and in this case it is best to create a separate "clone" bio for the remainder. When we use bio_clone_bioset() to split off the front part of a bio and chain the two together and submit the remainder to generic_make_request(), it is important that the newly allocated bio is used as the head to be processed immediately, and the original bio gets "bio_advance()"d and sent to generic_make_request() as the remainder. Otherwise, if the newly allocated bio is used as the remainder, and if it then needs to be split again, then the next bio_clone_bioset() call will be made while holding a reference to a bio (result of the first clone) from the same bioset. This can potentially exhaust the bioset mempool and result in a memory allocation deadlock. Note that there is no race caused by reassigning ci.io->bio after already calling __map_bio(). This bio will only be dereferenced again after dec_pending() has found io->io_count to be zero, and this cannot happen before the dec_pending() call at the end of __split_and_process_bio(). To provide the clone bio when splitting, we use q->bio_split. This was previously being freed by bio-based dm to avoid having excess rescuer threads. As bio_split bio sets no longer create rescuer threads, there is little cost and much gain from restoring the q->bio_split bio set. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index fb9e6d808170..07dec8ece083 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1498,8 +1498,29 @@ static void __split_and_process_bio(struct mapped_device *md, } else { ci.bio = bio; ci.sector_count = bio_sectors(bio); - while (ci.sector_count && !error) + while (ci.sector_count && !error) { error = __split_and_process_non_flush(&ci); + if (current->bio_list && ci.sector_count && !error) { + /* + * Remainder must be passed to generic_make_request() + * so that it gets handled *after* bios already submitted + * have been completely processed. + * We take a clone of the original to store in + * ci.io->bio to be used by end_io_acct() and + * for dec_pending to use for completion handling. + * As this path is not used for REQ_OP_ZONE_REPORT, + * the usage of io->bio in dm_remap_zone_report() + * won't be affected by this reassignment.
+ */ + struct bio *b = bio_clone_bioset(bio, GFP_NOIO, + md->queue->bio_split); + ci.io->bio = b; + bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9); + bio_chain(b, bio); + generic_make_request(bio); + break; + } + } } /* drop the extra reference count */ @@ -1510,8 +1531,8 @@ static void __split_and_process_bio(struct mapped_device *md, *---------------------------------------------------------------*/ /* - * The request function that just remaps the bio built up by - * dm_merge_bvec. + * The request function that remaps the bio to one target and + * splits off any remainder. */ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) { @@ -2034,12 +2055,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) case DM_TYPE_DAX_BIO_BASED: dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); - /* - * DM handles splitting bios as needed. Free the bio_split bioset - * since it won't be used (saves 1 process per bio-based DM device). - */ - bioset_free(md->queue->bio_split); - md->queue->bio_split = NULL; if (type == DM_TYPE_DAX_BIO_BASED) queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); From f31c21e4365c02ccf7226c33ea978cd5dbfc351e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 22 Nov 2017 14:25:18 +1100 Subject: [PATCH 21/69] dm: remove unused 'num_write_bios' target interface No DM target provides num_write_bios and none has since dm-cache's brief use in 2013. Having the possibility of num_write_bios > 1 complicates bio allocation. So remove the interface and assume there is only one bio needed. If a target ever needs more, it must provide a suitable bioset and allocate itself based on its particular needs. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 30 ++++++++++-------------------- include/linux/device-mapper.h | 15 --------------- 2 files changed, 10 insertions(+), 35 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 07dec8ece083..2480c6abe8f1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1319,32 +1319,22 @@ static int __send_empty_flush(struct clone_info *ci) } static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, - sector_t sector, unsigned *len) + sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; - unsigned target_bio_nr; - unsigned num_target_bios = 1; - int r = 0; + int r; - /* - * Does the target want to receive duplicate copies of the bio? 
- */
- if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
- num_target_bios = ti->num_write_bios(ti, bio);
-
- for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
- tio = alloc_tio(ci, ti, target_bio_nr);
- tio->len_ptr = len;
- r = clone_bio(tio, bio, sector, *len);
- if (r < 0) {
- free_tio(tio);
- break;
- }
- __map_bio(tio);
+ tio = alloc_tio(ci, ti, 0);
+ tio->len_ptr = len;
+ r = clone_bio(tio, bio, sector, *len);
+ if (r < 0) {
+ free_tio(tio);
+ return r;
 }
+ __map_bio(tio);
- return r;
+ return 0;
 }

 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index a5538433c927..5a68b366e664 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -220,14 +220,6 @@ struct target_type {
 #define DM_TARGET_WILDCARD 0x00000008
 #define dm_target_is_wildcard(type) ((type)->features & DM_TARGET_WILDCARD)

-/*
- * Some targets need to be sent the same WRITE bio severals times so
- * that they can send copies of it to different devices. This function
- * examines any supplied bio and returns the number of copies of it the
- * target requires.
- */
-typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio);
-
 /*
  * A target implements own bio data integrity.
  */
@@ -291,13 +283,6 @@ struct dm_target {
  */
 unsigned per_io_data_size;

-/*
- * If defined, this function is called to find out how many
- * duplicate bios should be sent to the target when writing
- * data.
- */
-dm_num_write_bios_fn num_write_bios;
-
 /* target specific data */
 void *private;

From 318716ddea0829d3be566efc69d31029c40d51e2 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Wed, 22 Nov 2017 14:56:12 -0500
Subject: [PATCH 22/69] dm: safely allocate multiple bioset bios

DM targets can request multiple bios be sent to them by DM core (see: num_{flush,discard,write_same,write_zeroes}_bios). But until now these bios were allocated in an unsafe manner that could potentially exhaust the DM device's bioset -- in the face of multiple threads each trying to do multiple allocations from the same DM device's bioset.

Fix __send_duplicate_bios() by using the new alloc_multiple_bios(). The allocation strategy used by alloc_multiple_bios() models that used by dm-crypt.c:crypt_alloc_buffer().

Neil Brown initially proposed this fix but the implementation has been revised enough that it is inappropriate to attribute the entirety of it to him.
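The borrowed strategy can be condensed as follows (an illustrative sketch with invented helpers alloc_one() and free_one(); the real alloc_multiple_bios() appears in the diff below):

	/*
	 * Pass 1: try to allocate everything with GFP_NOWAIT so we never
	 * block while holding partial allocations. If that fails, free
	 * what was taken and retry with GFP_NOIO under a mutex, so at
	 * most one thread at a time can block on the shared pool.
	 */
	for (try = 0; try < 2; try++) {
		int i;

		if (try)
			mutex_lock(&lock);
		for (i = 0; i < n; i++) {
			obj[i] = alloc_one(try ? GFP_NOIO : GFP_NOWAIT);
			if (!obj[i])
				break;
		}
		if (try)
			mutex_unlock(&lock);
		if (i == n)
			break;		/* got them all */
		while (i--)
			free_one(obj[i]);	/* undo, then retry blocking */
	}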
Suggested-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 71 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2480c6abe8f1..79b8f072e76a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1264,16 +1264,17 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, return 0; } -static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti, - unsigned target_bio_nr) +static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti, + unsigned target_bio_nr, gfp_t gfp_mask) { struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); - tio = container_of(clone, struct dm_target_io, clone); + clone = bio_alloc_bioset(gfp_mask, 0, ci->md->bs); + if (!clone) + return NULL; + tio = container_of(clone, struct dm_target_io, clone); tio->io = ci->io; tio->ti = ti; tio->target_bio_nr = target_bio_nr; @@ -1281,11 +1282,49 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, return tio; } -static void __clone_and_map_simple_bio(struct clone_info *ci, - struct dm_target *ti, - unsigned target_bio_nr, unsigned *len) +static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, + struct dm_target *ti, unsigned num_bios) +{ + struct dm_target_io *tio; + int try; + + if (!num_bios) + return; + + if (num_bios == 1) { + tio = alloc_tio(ci, ti, 0, GFP_NOIO); + bio_list_add(blist, &tio->clone); + return; + } + + for (try = 0; try < 2; try++) { + int bio_nr; + struct bio *bio; + + if (try) + mutex_lock(&ci->md->table_devices_lock); + for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { + tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT); + if (!tio) + break; + + bio_list_add(blist, &tio->clone); + } + if (try) + mutex_unlock(&ci->md->table_devices_lock); + if (bio_nr == num_bios) + return; + + while ((bio = bio_list_pop(blist))) { + tio = container_of(bio, struct dm_target_io, clone); + free_tio(tio); + } + } +} + +static void __clone_and_map_simple_bio(struct clone_info *ci, + struct dm_target_io *tio, unsigned *len) { - struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); struct bio *clone = &tio->clone; tio->len_ptr = len; @@ -1300,10 +1339,16 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, unsigned num_bios, unsigned *len) { - unsigned target_bio_nr; + struct bio_list blist = BIO_EMPTY_LIST; + struct bio *bio; + struct dm_target_io *tio; - for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) - __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); + alloc_multiple_bios(&blist, ci, ti, num_bios); + + while ((bio = bio_list_pop(&blist))) { + tio = container_of(bio, struct dm_target_io, clone); + __clone_and_map_simple_bio(ci, tio, len); + } } static int __send_empty_flush(struct clone_info *ci) @@ -1325,7 +1370,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, struct dm_target_io *tio; int r; - tio = alloc_tio(ci, ti, 0); + tio = alloc_tio(ci, ti, 0, GFP_NOIO); tio->len_ptr = len; r = clone_bio(tio, bio, sector, *len); if (r < 0) { From 4a3f54d94d5c39ee0a78a76120f363bc0cda02c9 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 22 Nov 2017 15:37:43 -0500 Subject: [PATCH 23/69] dm: remove BIOSET_NEED_RESCUER based dm_offload infrastructure Now that all of DM has been revised and/or verified to no longer require the 
use of BIOSET_NEED_RESCUER the dm_offload code may be removed.

Suggested-by: NeilBrown
Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 60 +-------------------------------------------------
 1 file changed, 1 insertion(+), 59 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 79b8f072e76a..2e0e10a1c030 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1114,65 +1114,10 @@ void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
 }
 EXPORT_SYMBOL_GPL(dm_remap_zone_report);

-/*
- * Flush current->bio_list when the target map method blocks.
- * This fixes deadlocks in snapshot and possibly in other targets.
- */
-struct dm_offload {
- struct blk_plug plug;
- struct blk_plug_cb cb;
-};
-
-static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
-{
- struct dm_offload *o = container_of(cb, struct dm_offload, cb);
- struct bio_list list;
- struct bio *bio;
- int i;
-
- INIT_LIST_HEAD(&o->cb.list);
-
- if (unlikely(!current->bio_list))
- return;
-
- for (i = 0; i < 2; i++) {
- list = current->bio_list[i];
- bio_list_init(&current->bio_list[i]);
-
- while ((bio = bio_list_pop(&list))) {
- struct bio_set *bs = bio->bi_pool;
- if (unlikely(!bs) || bs == fs_bio_set ||
- !bs->rescue_workqueue) {
- bio_list_add(&current->bio_list[i], bio);
- continue;
- }
-
- spin_lock(&bs->rescue_lock);
- bio_list_add(&bs->rescue_list, bio);
- queue_work(bs->rescue_workqueue, &bs->rescue_work);
- spin_unlock(&bs->rescue_lock);
- }
- }
-}
-
-static void dm_offload_start(struct dm_offload *o)
-{
- blk_start_plug(&o->plug);
- o->cb.callback = flush_current_bio_list;
- list_add(&o->cb.list, &current->plug->cb_list);
-}
-
-static void dm_offload_end(struct dm_offload *o)
-{
- list_del(&o->cb.list);
- blk_finish_plug(&o->plug);
-}
-
 static void __map_bio(struct dm_target_io *tio)
 {
 int r;
 sector_t sector;
- struct dm_offload o;
 struct bio *clone = &tio->clone;
 struct dm_target *ti = tio->ti;
@@ -1186,10 +1131,7 @@ static void __map_bio(struct dm_target_io *tio)
 atomic_inc(&tio->io->io_count);
 sector = clone->bi_iter.bi_sector;
- dm_offload_start(&o);
 r = ti->type->map(ti, clone);
- dm_offload_end(&o);
-
 switch (r) {
 case DM_MAPIO_SUBMITTED:
 break;
@@ -2814,7 +2756,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
 BUG();
 }

- pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
+ pools->bs = bioset_create(pool_size, front_pad, 0);
 if (!pools->bs)
 goto out;

From 0776aa0e30aa31b2fad606457e9d3faf39d88314 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Fri, 8 Dec 2017 14:40:52 -0500
Subject: [PATCH 24/69] dm: ensure bio-based DM's bioset and io_pool support targets' maximum IOs

alloc_multiple_bios() assumes it can allocate the requested number of bios but until now there was no guarantee that the mempools would be accommodating.
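Concretely, the sizing rule in the fix below can be sketched (using the diff's own names) as:

	/* cover the worst-case bio burst any one target may request */
	for (i = 0; i < t->num_targets; i++) {
		ti = t->targets + i;
		min_pool_size = max(min_pool_size, ti->num_flush_bios);
	}
	pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);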
Suggested-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 11 +++++++---- drivers/md/dm.c | 28 ++++++++++++++++++---------- drivers/md/dm.h | 3 ++- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index aaffd0c0ee9a..7b22cc8d30f4 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1079,7 +1079,8 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * { enum dm_queue_mode type = dm_table_get_type(t); unsigned per_io_data_size = 0; - struct dm_target *tgt; + unsigned min_pool_size = 0; + struct dm_target *ti; unsigned i; if (unlikely(type == DM_TYPE_NONE)) { @@ -1089,11 +1090,13 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * if (__table_type_bio_based(type)) for (i = 0; i < t->num_targets; i++) { - tgt = t->targets + i; - per_io_data_size = max(per_io_data_size, tgt->per_io_data_size); + ti = t->targets + i; + per_io_data_size = max(per_io_data_size, ti->per_io_data_size); + min_pool_size = max(min_pool_size, ti->num_flush_bios); } - t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_io_data_size); + t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, + per_io_data_size, min_pool_size); if (!t->mempools) return -ENOMEM; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2e0e10a1c030..9d255e5c9688 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1810,17 +1810,26 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) { struct dm_md_mempools *p = dm_table_get_md_mempools(t); - if (md->bs) { - /* The md already has necessary mempools. */ - if (dm_table_bio_based(t)) { + if (dm_table_bio_based(t)) { + /* The md may already have mempools that need changing. */ + if (md->bs) { /* * Reload bioset because front_pad may have changed * because a different table was loaded. */ bioset_free(md->bs); - md->bs = p->bs; - p->bs = NULL; + md->bs = NULL; } + if (md->io_pool) { + /* + * Reload io_pool because pool_size may have changed + * because a different table was loaded. + */ + mempool_destroy(md->io_pool); + md->io_pool = NULL; + } + + } else if (md->bs) { /* * There's no need to reload with request-based dm * because the size of front_pad doesn't change. 
@@ -1838,7 +1847,6 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) p->io_pool = NULL; md->bs = p->bs; p->bs = NULL; - out: /* mempool bind completed, no longer need any mempools in the table */ dm_table_free_md_mempools(t); @@ -2727,7 +2735,8 @@ int dm_noflush_suspending(struct dm_target *ti) EXPORT_SYMBOL_GPL(dm_noflush_suspending); struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_io_data_size) + unsigned integrity, unsigned per_io_data_size, + unsigned min_pool_size) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); unsigned int pool_size = 0; @@ -2739,16 +2748,15 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu switch (type) { case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: - pool_size = dm_get_reserved_bio_based_ios(); + pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); - pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache); if (!pools->io_pool) goto out; break; case DM_TYPE_REQUEST_BASED: case DM_TYPE_MQ_REQUEST_BASED: - pool_size = dm_get_reserved_rq_based_ios(); + pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_io_data_size is used for blk-mq pdu at queue allocation */ break; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 36399bb875dd..7c66c316add3 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -206,7 +206,8 @@ void dm_kcopyd_exit(void); * Mempool operations */ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_bio_data_size); + unsigned integrity, unsigned per_bio_data_size, + unsigned min_pool_size); void dm_free_md_mempools(struct dm_md_mempools *pools); /* From 3d7f45625a84696f61c6470a887bdc65180937a9 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 8 Dec 2017 15:02:11 -0500 Subject: [PATCH 25/69] dm: fix __send_changing_extent_only() to send first bio and chain remainder __send_changing_extent_only() must follow the same pattern that was established with commit "dm: ensure bio submission follows a depth-first tree walk". That is: submit first bio up to split boundary and then split the remainder to further submissions. Suggested-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 66 ++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 36 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9d255e5c9688..4e7682afebfa 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1348,56 +1348,50 @@ static bool is_split_required_for_discard(struct dm_target *ti) return ti->split_discard_bios; } -static int __send_changing_extent_only(struct clone_info *ci, +static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, get_num_bios_fn get_num_bios, is_split_required_fn is_split_required) { - struct dm_target *ti; unsigned len; unsigned num_bios; - do { - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; + /* + * Even though the device advertised support for this type of + * request, that does not mean every target supports it, and + * reconfiguration might also have changed that since the + * check was performed. + */ + num_bios = get_num_bios ? 
get_num_bios(ti) : 0; + if (!num_bios) + return -EOPNOTSUPP; - /* - * Even though the device advertised support for this type of - * request, that does not mean every target supports it, and - * reconfiguration might also have changed that since the - * check was performed. - */ - num_bios = get_num_bios ? get_num_bios(ti) : 0; - if (!num_bios) - return -EOPNOTSUPP; + if (is_split_required && !is_split_required(ti)) + len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + else + len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); - if (is_split_required && !is_split_required(ti)) - len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); - else - len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); + __send_duplicate_bios(ci, ti, num_bios, &len); - __send_duplicate_bios(ci, ti, num_bios, &len); - - ci->sector += len; - } while (ci->sector_count -= len); + ci->sector += len; + ci->sector_count -= len; return 0; } -static int __send_discard(struct clone_info *ci) +static int __send_discard(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, get_num_discard_bios, + return __send_changing_extent_only(ci, ti, get_num_discard_bios, is_split_required_for_discard); } -static int __send_write_same(struct clone_info *ci) +static int __send_write_same(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); + return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL); } -static int __send_write_zeroes(struct clone_info *ci) +static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL); + return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL); } /* @@ -1410,17 +1404,17 @@ static int __split_and_process_non_flush(struct clone_info *ci) unsigned len; int r; - if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) - return __send_discard(ci); - else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - return __send_write_same(ci); - else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) - return __send_write_zeroes(ci); - ti = dm_table_find_target(ci->map, ci->sector); if (!dm_target_is_valid(ti)) return -EIO; + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) + return __send_discard(ci, ti); + else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) + return __send_write_same(ci, ti); + else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) + return __send_write_zeroes(ci, ti); + if (bio_op(bio) == REQ_OP_ZONE_REPORT) len = ci->sector_count; else From ad3793fc3945173f64d82d05d3ecde41f6c0435c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 4 Dec 2017 23:28:32 -0500 Subject: [PATCH 26/69] dm: set QUEUE_FLAG_DAX accordingly in dm_table_set_restrictions() Rather than having DAX support be unique by setting it based on table type in dm_setup_md_queue(). 
Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 2 ++ drivers/md/dm.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7b22cc8d30f4..504e79bc3a55 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1823,6 +1823,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); + if (dm_table_supports_dax(t)) + queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); if (dm_table_supports_dax_write_cache(t)) dax_write_cache(t->md->dax_dev, true); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4e7682afebfa..308d178fff73 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2034,9 +2034,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) case DM_TYPE_DAX_BIO_BASED: dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); - - if (type == DM_TYPE_DAX_BIO_BASED) - queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); break; case DM_TYPE_NONE: WARN_ON_ONCE(true); From 2abf1fc91d8139fa4f8b19b06f649af191224242 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 9 Dec 2017 20:38:16 -0500 Subject: [PATCH 27/69] dm: remove stale comment blocks These CRUD comments have worn out their welcome. The code is what it is, over time it'll hopefully get better. But these comments serve no purpose whatsoever. Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 308d178fff73..5827e1641ba2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -752,15 +752,6 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) return 0; } -/*----------------------------------------------------------------- - * CRUD START: - * A more elegant soln is in the works that uses the queue - * merge fn, unfortunately there are a couple of changes to - * the block layer that I want to make for this. So in the - * interests of getting something for people to use I give - * you this clearly demarcated crap. - *---------------------------------------------------------------*/ - static int __noflush_suspending(struct mapped_device *md) { return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); @@ -1497,9 +1488,6 @@ static void __split_and_process_bio(struct mapped_device *md, /* drop the extra reference count */ dec_pending(ci.io, errno_to_blk_status(error)); } -/*----------------------------------------------------------------- - * CRUD END - *---------------------------------------------------------------*/ /* * The request function that remaps the bio to one target and From 745dc570b2c379730d2a78acdeb65b5239e833c6 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 11 Dec 2017 20:51:50 -0500 Subject: [PATCH 28/69] dm: rename 'bio' member of dm_io structure to 'orig_bio' Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5827e1641ba2..3e3fbc6f708f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -60,13 +60,13 @@ void dm_issue_global_event(void) } /* - * One of these is allocated per bio. + * One of these is allocated per original bio. 
*/ struct dm_io { struct mapped_device *md; blk_status_t status; atomic_t io_count; - struct bio *bio; + struct bio *orig_bio; unsigned long start_time; spinlock_t endio_lock; struct dm_stats_aux stats_aux; @@ -510,7 +510,7 @@ int md_in_flight(struct mapped_device *md) static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; - struct bio *bio = io->bio; + struct bio *bio = io->orig_bio; int cpu; int rw = bio_data_dir(bio); @@ -531,7 +531,7 @@ static void start_io_acct(struct dm_io *io) static void end_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; - struct bio *bio = io->bio; + struct bio *bio = io->orig_bio; unsigned long duration = jiffies - io->start_time; int pending; int rw = bio_data_dir(bio); @@ -771,8 +771,7 @@ static void dec_pending(struct dm_io *io, blk_status_t error) /* Push-back supersedes any I/O errors */ if (unlikely(error)) { spin_lock_irqsave(&io->endio_lock, flags); - if (!(io->status == BLK_STS_DM_REQUEUE && - __noflush_suspending(md))) + if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md))) io->status = error; spin_unlock_irqrestore(&io->endio_lock, flags); } @@ -784,7 +783,8 @@ static void dec_pending(struct dm_io *io, blk_status_t error) */ spin_lock_irqsave(&md->deferred_lock, flags); if (__noflush_suspending(md)) - bio_list_add_head(&md->deferred, io->bio); + /* NOTE early return due to BLK_STS_DM_REQUEUE below */ + bio_list_add_head(&md->deferred, io->orig_bio); else /* noflush suspend was interrupted. */ io->status = BLK_STS_IOERR; @@ -792,7 +792,7 @@ static void dec_pending(struct dm_io *io, blk_status_t error) } io_error = io->status; - bio = io->bio; + bio = io->orig_bio; end_io_acct(io); free_io(md, io); @@ -1038,7 +1038,7 @@ void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start) { #ifdef CONFIG_BLK_DEV_ZONED struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); - struct bio *report_bio = tio->io->bio; + struct bio *report_bio = tio->io->orig_bio; struct blk_zone_report_hdr *hdr = NULL; struct blk_zone *zone; unsigned int nr_rep = 0; @@ -1129,7 +1129,7 @@ static void __map_bio(struct dm_target_io *tio) case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ trace_block_bio_remap(clone->bi_disk->queue, clone, - bio_dev(tio->io->bio), sector); + bio_dev(tio->io->orig_bio), sector); generic_make_request(clone); break; case DM_MAPIO_KILL: @@ -1441,7 +1441,7 @@ static void __split_and_process_bio(struct mapped_device *md, ci.io = alloc_io(md); ci.io->status = 0; atomic_set(&ci.io->io_count, 1); - ci.io->bio = bio; + ci.io->orig_bio = bio; ci.io->md = md; spin_lock_init(&ci.io->endio_lock); ci.sector = bio->bi_iter.bi_sector; @@ -1468,15 +1468,15 @@ static void __split_and_process_bio(struct mapped_device *md, * so that it gets handled *after* bios already submitted * have been completely processed. * We take a clone of the original to store in - * ci.io->bio to be used by end_io_acct() and + * ci.io->orig_bio to be used by end_io_acct() and * for dec_pending to use for completion handling. * As this path is not used for REQ_OP_ZONE_REPORT, - * the usage of io->bio in dm_remap_zone_report() + * the usage of io->orig_bio in dm_remap_zone_report() * won't be affected by this reassignment. 
*/ struct bio *b = bio_clone_bioset(bio, GFP_NOIO, md->queue->bio_split); - ci.io->bio = b; + ci.io->orig_bio = b; bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9); bio_chain(b, bio); generic_make_request(bio); From 64f52b0e31489b46465cff2e61ab2e1f60a3b4eb Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 11 Dec 2017 23:17:47 -0500 Subject: [PATCH 29/69] dm: improve performance by moving dm_io structure to per-bio-data Eliminates need for a separate mempool to allocate 'struct dm_io' objects from. As such, it saves an extra mempool allocation for each original bio that DM core is issued. This complicates the per-bio-data accessor functions by needing to conditonally add extra padding to get to a target's per-bio-data. But in the end this provides a decent performance improvement for all bio-based DM devices. On an NVMe-loop based testbed to a ramdisk (~3100 MB/s): bio-based DM linear performance improved by 2% (went from 2665 to 2777 MB/s). Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 1 + drivers/md/dm.c | 171 ++++++++++++++++++++++++++-------- include/linux/device-mapper.h | 32 +------ 3 files changed, 134 insertions(+), 70 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 6a14f945783c..8a7dc8f9e40f 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -91,6 +91,7 @@ struct mapped_device { /* * io objects are allocated from here. */ + struct bio_set *io_bs; mempool_t *io_pool; struct bio_set *bs; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3e3fbc6f708f..01d0f9c410fb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -60,9 +60,38 @@ void dm_issue_global_event(void) } /* - * One of these is allocated per original bio. + * One of these is allocated (on-stack) per original bio. */ +struct clone_info { + struct mapped_device *md; + struct dm_table *map; + struct bio *bio; + struct dm_io *io; + sector_t sector; + unsigned sector_count; +}; + +/* + * One of these is allocated per clone bio. + */ +#define DM_TIO_MAGIC 7282014 +struct dm_target_io { + unsigned magic; + struct dm_io *io; + struct dm_target *ti; + unsigned target_bio_nr; + unsigned *len_ptr; + bool inside_dm_io; + struct bio clone; +}; + +/* + * One of these is allocated per original bio. + * It contains the first clone used for that original. 
+ */ +#define DM_IO_MAGIC 5191977 struct dm_io { + unsigned magic; struct mapped_device *md; blk_status_t status; atomic_t io_count; @@ -70,8 +99,35 @@ struct dm_io { unsigned long start_time; spinlock_t endio_lock; struct dm_stats_aux stats_aux; + /* last member of dm_target_io is 'struct bio' */ + struct dm_target_io tio; }; +void *dm_per_bio_data(struct bio *bio, size_t data_size) +{ + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + if (!tio->inside_dm_io) + return (char *)bio - offsetof(struct dm_target_io, clone) - data_size; + return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size; +} +EXPORT_SYMBOL_GPL(dm_per_bio_data); + +struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) +{ + struct dm_io *io = (struct dm_io *)((char *)data + data_size); + if (io->magic == DM_IO_MAGIC) + return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone)); + BUG_ON(io->magic != DM_TIO_MAGIC); + return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone)); +} +EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data); + +unsigned dm_bio_get_target_bio_nr(const struct bio *bio) +{ + return container_of(bio, struct dm_target_io, clone)->target_bio_nr; +} +EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); + #define MINOR_ALLOCED ((void *)-1) /* @@ -95,6 +151,7 @@ static int dm_numa_node = DM_NUMA_NODE; struct dm_md_mempools { mempool_t *io_pool; struct bio_set *bs; + struct bio_set *io_bs; }; struct table_device { @@ -488,16 +545,58 @@ out: static struct dm_io *alloc_io(struct mapped_device *md) { - return mempool_alloc(md->io_pool, GFP_NOIO); + struct dm_io *io; + struct dm_target_io *tio; + struct bio *clone; + + clone = bio_alloc_bioset(GFP_NOIO, 0, md->io_bs); + if (!clone) + return NULL; + + tio = container_of(clone, struct dm_target_io, clone); + tio->inside_dm_io = true; + tio->io = NULL; + + io = container_of(tio, struct dm_io, tio); + io->magic = DM_IO_MAGIC; + + return io; } static void free_io(struct mapped_device *md, struct dm_io *io) { - mempool_free(io, md->io_pool); + bio_put(&io->tio.clone); +} + +static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti, + unsigned target_bio_nr, gfp_t gfp_mask) +{ + struct dm_target_io *tio; + + if (!ci->io->tio.io) { + /* the dm_target_io embedded in ci->io is available */ + tio = &ci->io->tio; + } else { + struct bio *clone = bio_alloc_bioset(gfp_mask, 0, ci->md->bs); + if (!clone) + return NULL; + + tio = container_of(clone, struct dm_target_io, clone); + tio->inside_dm_io = false; + } + + tio->magic = DM_TIO_MAGIC; + tio->io = ci->io; + tio->ti = ti; + tio->target_bio_nr = target_bio_nr; + + return tio; } static void free_tio(struct dm_target_io *tio) { + if (tio->inside_dm_io) + return; bio_put(&tio->clone); } @@ -1110,6 +1209,7 @@ static void __map_bio(struct dm_target_io *tio) int r; sector_t sector; struct bio *clone = &tio->clone; + struct dm_io *io = tio->io; struct dm_target *ti = tio->ti; clone->bi_end_io = clone_endio; @@ -1119,7 +1219,7 @@ static void __map_bio(struct dm_target_io *tio) * anything, the target has assumed ownership of * this io. 
*/ - atomic_inc(&tio->io->io_count); + atomic_inc(&io->io_count); sector = clone->bi_iter.bi_sector; r = ti->type->map(ti, clone); @@ -1129,16 +1229,16 @@ static void __map_bio(struct dm_target_io *tio) case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ trace_block_bio_remap(clone->bi_disk->queue, clone, - bio_dev(tio->io->orig_bio), sector); + bio_dev(io->orig_bio), sector); generic_make_request(clone); break; case DM_MAPIO_KILL: - dec_pending(tio->io, BLK_STS_IOERR); free_tio(tio); + dec_pending(io, BLK_STS_IOERR); break; case DM_MAPIO_REQUEUE: - dec_pending(tio->io, BLK_STS_DM_REQUEUE); free_tio(tio); + dec_pending(io, BLK_STS_DM_REQUEUE); break; default: DMWARN("unimplemented target map return value: %d", r); @@ -1146,15 +1246,6 @@ static void __map_bio(struct dm_target_io *tio) } } -struct clone_info { - struct mapped_device *md; - struct dm_table *map; - struct bio *bio; - struct dm_io *io; - sector_t sector; - unsigned sector_count; -}; - static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) { bio->bi_iter.bi_sector = sector; @@ -1197,24 +1288,6 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, return 0; } -static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti, - unsigned target_bio_nr, gfp_t gfp_mask) -{ - struct dm_target_io *tio; - struct bio *clone; - - clone = bio_alloc_bioset(gfp_mask, 0, ci->md->bs); - if (!clone) - return NULL; - - tio = container_of(clone, struct dm_target_io, clone); - tio->io = ci->io; - tio->ti = ti; - tio->target_bio_nr = target_bio_nr; - - return tio; -} - static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct dm_target *ti, unsigned num_bios) { @@ -1628,6 +1701,8 @@ static void cleanup_mapped_device(struct mapped_device *md) mempool_destroy(md->io_pool); if (md->bs) bioset_free(md->bs); + if (md->io_bs) + bioset_free(md->io_bs); if (md->dax_dev) { kill_dax(md->dax_dev); @@ -1793,15 +1868,19 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) struct dm_md_mempools *p = dm_table_get_md_mempools(t); if (dm_table_bio_based(t)) { - /* The md may already have mempools that need changing. */ + /* + * The md may already have mempools that need changing. + * If so, reload bioset because front_pad may have changed + * because a different table was loaded. + */ if (md->bs) { - /* - * Reload bioset because front_pad may have changed - * because a different table was loaded. 
- */ bioset_free(md->bs); md->bs = NULL; } + if (md->io_bs) { + bioset_free(md->io_bs); + md->io_bs = NULL; + } if (md->io_pool) { /* * Reload io_pool because pool_size may have changed @@ -1823,12 +1902,14 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) goto out; } - BUG_ON(!p || md->io_pool || md->bs); + BUG_ON(!p || md->io_pool || md->bs || md->io_bs); md->io_pool = p->io_pool; p->io_pool = NULL; md->bs = p->bs; p->bs = NULL; + md->io_bs = p->io_bs; + p->io_bs = NULL; out: /* mempool bind completed, no longer need any mempools in the table */ dm_table_free_md_mempools(t); @@ -2719,7 +2800,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); unsigned int pool_size = 0; - unsigned int front_pad; + unsigned int front_pad, io_front_pad; if (!pools) return NULL; @@ -2729,6 +2810,12 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu case DM_TYPE_DAX_BIO_BASED: pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); + io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio); + pools->io_bs = bioset_create(pool_size, io_front_pad, 0); + if (!pools->io_bs) + goto out; + if (integrity && bioset_integrity_create(pools->io_bs, pool_size)) + goto out; pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache); if (!pools->io_pool) goto out; @@ -2767,6 +2854,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) if (pools->bs) bioset_free(pools->bs); + if (pools->io_bs) + bioset_free(pools->io_bs); kfree(pools); } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 5a68b366e664..0e518d2ee280 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -314,35 +314,9 @@ struct dm_target_callbacks { int (*congested_fn) (struct dm_target_callbacks *, int); }; -/* - * For bio-based dm. - * One of these is allocated for each bio. - * This structure shouldn't be touched directly by target drivers. 
- * It is here so that we can inline dm_per_bio_data and - * dm_bio_from_per_bio_data - */ -struct dm_target_io { - struct dm_io *io; - struct dm_target *ti; - unsigned target_bio_nr; - unsigned *len_ptr; - struct bio clone; -}; - -static inline void *dm_per_bio_data(struct bio *bio, size_t data_size) -{ - return (char *)bio - offsetof(struct dm_target_io, clone) - data_size; -} - -static inline struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) -{ - return (struct bio *)((char *)data + data_size + offsetof(struct dm_target_io, clone)); -} - -static inline unsigned dm_bio_get_target_bio_nr(const struct bio *bio) -{ - return container_of(bio, struct dm_target_io, clone)->target_bio_nr; -} +void *dm_per_bio_data(struct bio *bio, size_t data_size); +struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size); +unsigned dm_bio_get_target_bio_nr(const struct bio *bio); int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); From dde1e1ec4c08c3b7a4cee6cdad53948680d9fe39 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 11 Dec 2017 23:28:13 -0500 Subject: [PATCH 30/69] dm: remove now unused bio-based io_pool and _io_cache Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 2 -- drivers/md/dm.c | 30 ++---------------------------- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 8a7dc8f9e40f..124ffa2d6b9a 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -92,8 +92,6 @@ struct mapped_device { * io objects are allocated from here. */ struct bio_set *io_bs; - mempool_t *io_pool; - struct bio_set *bs; /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 01d0f9c410fb..4284ad8d9892 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -149,7 +149,6 @@ static int dm_numa_node = DM_NUMA_NODE; * For mempools pre-allocation at the table loading time. */ struct dm_md_mempools { - mempool_t *io_pool; struct bio_set *bs; struct bio_set *io_bs; }; @@ -160,7 +159,6 @@ struct table_device { struct dm_dev dm_dev; }; -static struct kmem_cache *_io_cache; static struct kmem_cache *_rq_tio_cache; static struct kmem_cache *_rq_cache; @@ -227,14 +225,9 @@ static int __init local_init(void) { int r = -ENOMEM; - /* allocate a slab for the dm_ios */ - _io_cache = KMEM_CACHE(dm_io, 0); - if (!_io_cache) - return r; - _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); if (!_rq_tio_cache) - goto out_free_io_cache; + return r; _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request), __alignof__(struct request), 0, NULL); @@ -269,8 +262,6 @@ out_free_rq_cache: kmem_cache_destroy(_rq_cache); out_free_rq_tio_cache: kmem_cache_destroy(_rq_tio_cache); -out_free_io_cache: - kmem_cache_destroy(_io_cache); return r; } @@ -282,7 +273,6 @@ static void local_exit(void) kmem_cache_destroy(_rq_cache); kmem_cache_destroy(_rq_tio_cache); - kmem_cache_destroy(_io_cache); unregister_blkdev(_major, _name); dm_uevent_exit(); @@ -1698,7 +1688,6 @@ static void cleanup_mapped_device(struct mapped_device *md) destroy_workqueue(md->wq); if (md->kworker_task) kthread_stop(md->kworker_task); - mempool_destroy(md->io_pool); if (md->bs) bioset_free(md->bs); if (md->io_bs) @@ -1881,14 +1870,6 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) bioset_free(md->io_bs); md->io_bs = NULL; } - if (md->io_pool) { - /* - * Reload io_pool because pool_size may have changed - * because a different table was loaded. 
- */ - mempool_destroy(md->io_pool); - md->io_pool = NULL; - } } else if (md->bs) { /* @@ -1902,10 +1883,8 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) goto out; } - BUG_ON(!p || md->io_pool || md->bs || md->io_bs); + BUG_ON(!p || md->bs || md->io_bs); - md->io_pool = p->io_pool; - p->io_pool = NULL; md->bs = p->bs; p->bs = NULL; md->io_bs = p->io_bs; @@ -2816,9 +2795,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu goto out; if (integrity && bioset_integrity_create(pools->io_bs, pool_size)) goto out; - pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache); - if (!pools->io_pool) - goto out; break; case DM_TYPE_REQUEST_BASED: case DM_TYPE_MQ_REQUEST_BASED: @@ -2850,8 +2826,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) if (!pools) return; - mempool_destroy(pools->io_pool); - if (pools->bs) bioset_free(pools->bs); if (pools->io_bs) From bc02cdbe53eadcef75221c3f1f48cdcbdb9cb6ef Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 14 Dec 2017 16:30:42 -0500 Subject: [PATCH 31/69] dm: remove redundant mapped_device member from clone_info structure 'struct dm_io' already has the same pointer. So update all accesses from ci->md to ci->io->md. Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4284ad8d9892..e4213c4c7c9b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -63,7 +63,6 @@ void dm_issue_global_event(void) * One of these is allocated (on-stack) per original bio. */ struct clone_info { - struct mapped_device *md; struct dm_table *map; struct bio *bio; struct dm_io *io; @@ -567,7 +566,7 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t /* the dm_target_io embedded in ci->io is available */ tio = &ci->io->tio; } else { - struct bio *clone = bio_alloc_bioset(gfp_mask, 0, ci->md->bs); + struct bio *clone = bio_alloc_bioset(gfp_mask, 0, ci->io->md->bs); if (!clone) return NULL; @@ -1298,7 +1297,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct bio *bio; if (try) - mutex_lock(&ci->md->table_devices_lock); + mutex_lock(&ci->io->md->table_devices_lock); for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT); if (!tio) @@ -1307,7 +1306,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, bio_list_add(blist, &tio->clone); } if (try) - mutex_unlock(&ci->md->table_devices_lock); + mutex_unlock(&ci->io->md->table_devices_lock); if (bio_nr == num_bios) return; @@ -1500,7 +1499,6 @@ static void __split_and_process_bio(struct mapped_device *md, } ci.map = map; - ci.md = md; ci.io = alloc_io(md); ci.io->status = 0; atomic_set(&ci.io->io_count, 1); @@ -1512,7 +1510,7 @@ static void __split_and_process_bio(struct mapped_device *md, start_io_acct(ci.io); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.md->flush_bio; + ci.bio = &ci.io->md->flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ From f3986374f94951b0fec6980e5b2dd621c51b215c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sun, 17 Dec 2017 11:56:48 -0500 Subject: [PATCH 32/69] dm: simplify start of block stats accounting for bio-based No apparent need to generic_start_io_acct() until before the IO is ready for submission. 
start_io_acct() is the proper place to do this accounting -- it is also where DM accounts for pending IO and, if enabled, starts dm-stats accounting. Replace start_io_acct()'s part_round_stats() with generic_start_io_acct(). This eliminates needing to take part_stat_lock() multiple times when starting an IO on bio-based devices. Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e4213c4c7c9b..cbb4ae5051fc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -599,16 +599,14 @@ static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; - int cpu; int rw = bio_data_dir(bio); io->start_time = jiffies; - cpu = part_stat_lock(); - part_round_stats(md->queue, cpu, &dm_disk(md)->part0); - part_stat_unlock(); + generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0); + atomic_set(&dm_disk(md)->part0.in_flight[rw], - atomic_inc_return(&md->pending[rw])); + atomic_inc_return(&md->pending[rw])); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), @@ -1556,15 +1554,12 @@ static void __split_and_process_bio(struct mapped_device *md, */ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) { - int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; int srcu_idx; struct dm_table *map; map = dm_get_live_table(md, &srcu_idx); - generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0); - /* if we're suspended, we have to queue this io for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { dm_put_live_table(md, srcu_idx); From 22c11858e8002592c59ebb762e4e42dc634bf84f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 4 Dec 2017 21:07:37 -0500 Subject: [PATCH 33/69] dm: introduce DM_TYPE_NVME_BIO_BASED If dm_table_determine_type() establishes DM_TYPE_NVME_BIO_BASED then all devices in the DM table do not support partial completions. Also, the table has a single immutable target that doesn't require DM core to split bios. This will enable adding NVMe optimizations to bio-based DM. 
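In outline, the qualification test amounts to the following (a sketch built from the helpers introduced in the diff below; the wrapper name is invented):

	/*
	 * A table is promoted to DM_TYPE_NVME_BIO_BASED only if it has a
	 * single immutable target and none of its devices can complete a
	 * bio partially (for now approximated as "all devices are NVMe").
	 */
	static bool can_be_nvme_bio_based(struct dm_table *t)
	{
		return dm_table_get_immutable_target(t) &&
		       dm_table_does_not_support_partial_completion(t);
	}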
Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 54 +++++++++++++++++++++++++++++++---- drivers/md/dm.c | 2 ++ include/linux/device-mapper.h | 1 + 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 504e79bc3a55..ad4ac294dd57 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -866,7 +866,8 @@ EXPORT_SYMBOL(dm_consume_args); static bool __table_type_bio_based(enum dm_queue_mode table_type) { return (table_type == DM_TYPE_BIO_BASED || - table_type == DM_TYPE_DAX_BIO_BASED); + table_type == DM_TYPE_DAX_BIO_BASED || + table_type == DM_TYPE_NVME_BIO_BASED); } static bool __table_type_request_based(enum dm_queue_mode table_type) @@ -909,6 +910,8 @@ static bool dm_table_supports_dax(struct dm_table *t) return true; } +static bool dm_table_does_not_support_partial_completion(struct dm_table *t); + static int dm_table_determine_type(struct dm_table *t) { unsigned i; @@ -923,6 +926,14 @@ static int dm_table_determine_type(struct dm_table *t) /* target already set the table's type */ if (t->type == DM_TYPE_BIO_BASED) return 0; + else if (t->type == DM_TYPE_NVME_BIO_BASED) { + if (!dm_table_does_not_support_partial_completion(t)) { + DMERR("nvme bio-based is only possible with devices" + " that don't support partial completion"); + return -EINVAL; + } + /* Fallthru, also verify all devices are blk-mq */ + } BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED); goto verify_rq_based; } @@ -937,8 +948,8 @@ static int dm_table_determine_type(struct dm_table *t) bio_based = 1; if (bio_based && request_based) { - DMWARN("Inconsistent table: different target types" - " can't be mixed up"); + DMERR("Inconsistent table: different target types" + " can't be mixed up"); return -EINVAL; } } @@ -959,8 +970,14 @@ static int dm_table_determine_type(struct dm_table *t) /* We must use this table as bio-based */ t->type = DM_TYPE_BIO_BASED; if (dm_table_supports_dax(t) || - (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) + (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { t->type = DM_TYPE_DAX_BIO_BASED; + } else if ((dm_table_get_immutable_target(t) && + dm_table_does_not_support_partial_completion(t)) || + (list_empty(devices) && live_md_type == DM_TYPE_NVME_BIO_BASED)) { + t->type = DM_TYPE_NVME_BIO_BASED; + goto verify_rq_based; + } return 0; } @@ -980,7 +997,8 @@ verify_rq_based: * (e.g. request completion process for partial completion.) */ if (t->num_targets > 1) { - DMWARN("Request-based dm doesn't support multiple targets yet"); + DMERR("%s DM doesn't support multiple targets", + t->type == DM_TYPE_NVME_BIO_BASED ? 
"nvme bio-based" : "request-based"); return -EINVAL; } @@ -997,6 +1015,15 @@ verify_rq_based: return 0; } + tgt = dm_table_get_immutable_target(t); + if (!tgt) { + DMERR("table load rejected: immutable target is required"); + return -EINVAL; + } else if (tgt->max_io_len) { + DMERR("table load rejected: immutable target that splits IO is not supported"); + return -EINVAL; + } + /* Non-request-stackable devices can't be used for request-based dm */ list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); @@ -1018,7 +1045,8 @@ verify_rq_based: } t->all_blk_mq = mq_count > 0; - if (t->type == DM_TYPE_MQ_REQUEST_BASED && !t->all_blk_mq) { + if (!t->all_blk_mq && + (t->type == DM_TYPE_MQ_REQUEST_BASED || t->type == DM_TYPE_NVME_BIO_BASED)) { DMERR("table load rejected: all devices are not blk-mq request-stackable"); return -EINVAL; } @@ -1708,6 +1736,20 @@ static bool dm_table_all_devices_attribute(struct dm_table *t, return true; } +static int device_no_partial_completion(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + char b[BDEVNAME_SIZE]; + + /* For now, NVMe devices are the only devices of this class */ + return (strncmp(bdevname(dev->bdev, b), "nvme", 3) == 0); +} + +static bool dm_table_does_not_support_partial_completion(struct dm_table *t) +{ + return dm_table_all_devices_attribute(t, device_no_partial_completion); +} + static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index cbb4ae5051fc..a1bd7a6ff522 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2073,6 +2073,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) break; case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: + case DM_TYPE_NVME_BIO_BASED: dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); break; @@ -2780,6 +2781,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu switch (type) { case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: + case DM_TYPE_NVME_BIO_BASED: pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 0e518d2ee280..41ec228b02a6 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -28,6 +28,7 @@ enum dm_queue_mode { DM_TYPE_REQUEST_BASED = 2, DM_TYPE_MQ_REQUEST_BASED = 3, DM_TYPE_DAX_BIO_BASED = 4, + DM_TYPE_NVME_BIO_BASED = 5, }; typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; From 978e51ba38e00e9da09b3ef9ed8c94af7b55a1eb Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 9 Dec 2017 15:16:42 -0500 Subject: [PATCH 34/69] dm: optimize bio-based NVMe IO submission Upper level bio-based drivers that stack immediately ontop of NVMe can leverage direct_make_request(). In addition DM's NVMe bio-based will initially only ever have one NVMe device that it submits IO to at a time. There is no splitting needed. Enhance DM core so that DM_TYPE_NVME_BIO_BASED's IO submission takes advantage of both of these characteristics. 
Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 154 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 120 insertions(+), 34 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a1bd7a6ff522..73d7f316ac1d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -532,7 +532,9 @@ out: return r; } -static struct dm_io *alloc_io(struct mapped_device *md) +static void start_io_acct(struct dm_io *io); + +static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) { struct dm_io *io; struct dm_target_io *tio; @@ -548,6 +550,13 @@ static struct dm_io *alloc_io(struct mapped_device *md) io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; + io->status = 0; + atomic_set(&io->io_count, 1); + io->orig_bio = bio; + io->md = md; + spin_lock_init(&io->endio_lock); + + start_io_acct(io); return io; } @@ -924,7 +933,7 @@ static void clone_endio(struct bio *bio) struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; - if (unlikely(error == BLK_STS_TARGET)) { + if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) { if (bio_op(bio) == REQ_OP_WRITE_SAME && !bio->bi_disk->queue->limits.max_write_same_sectors) disable_write_same(md); @@ -1191,13 +1200,15 @@ void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start) } EXPORT_SYMBOL_GPL(dm_remap_zone_report); -static void __map_bio(struct dm_target_io *tio) +static blk_qc_t __map_bio(struct dm_target_io *tio) { int r; sector_t sector; struct bio *clone = &tio->clone; struct dm_io *io = tio->io; + struct mapped_device *md = io->md; struct dm_target *ti = tio->ti; + blk_qc_t ret = BLK_QC_T_NONE; clone->bi_end_io = clone_endio; @@ -1217,7 +1228,10 @@ static void __map_bio(struct dm_target_io *tio) /* the bio has been remapped so dispatch it */ trace_block_bio_remap(clone->bi_disk->queue, clone, bio_dev(io->orig_bio), sector); - generic_make_request(clone); + if (md->type == DM_TYPE_NVME_BIO_BASED) + ret = direct_make_request(clone); + else + ret = generic_make_request(clone); break; case DM_MAPIO_KILL: free_tio(tio); @@ -1231,6 +1245,8 @@ static void __map_bio(struct dm_target_io *tio) DMWARN("unimplemented target map return value: %d", r); BUG(); } + + return ret; } static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) @@ -1315,8 +1331,8 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } } -static void __clone_and_map_simple_bio(struct clone_info *ci, - struct dm_target_io *tio, unsigned *len) +static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci, + struct dm_target_io *tio, unsigned *len) { struct bio *clone = &tio->clone; @@ -1326,7 +1342,7 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, if (len) bio_setup_sector(clone, ci->sector, *len); - __map_bio(tio); + return __map_bio(tio); } static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, @@ -1340,7 +1356,7 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, while ((bio = bio_list_pop(&blist))) { tio = container_of(bio, struct dm_target_io, clone); - __clone_and_map_simple_bio(ci, tio, len); + (void) __clone_and_map_simple_bio(ci, tio, len); } } @@ -1370,7 +1386,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, free_tio(tio); return r; } - __map_bio(tio); + (void) __map_bio(tio); return 0; } @@ -1482,30 +1498,30 @@ static int __split_and_process_non_flush(struct clone_info *ci) return 0; } +static void 
init_clone_info(struct clone_info *ci, struct mapped_device *md, + struct dm_table *map, struct bio *bio) +{ + ci->map = map; + ci->io = alloc_io(md, bio); + ci->sector = bio->bi_iter.bi_sector; +} + /* * Entry point to split a bio into clones and submit them to the targets. */ -static void __split_and_process_bio(struct mapped_device *md, - struct dm_table *map, struct bio *bio) +static blk_qc_t __split_and_process_bio(struct mapped_device *md, + struct dm_table *map, struct bio *bio) { struct clone_info ci; + blk_qc_t ret = BLK_QC_T_NONE; int error = 0; if (unlikely(!map)) { bio_io_error(bio); - return; + return ret; } - ci.map = map; - ci.io = alloc_io(md); - ci.io->status = 0; - atomic_set(&ci.io->io_count, 1); - ci.io->orig_bio = bio; - ci.io->md = md; - spin_lock_init(&ci.io->endio_lock); - ci.sector = bio->bi_iter.bi_sector; - - start_io_acct(ci.io); + init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { ci.bio = &ci.io->md->flush_bio; @@ -1538,7 +1554,7 @@ static void __split_and_process_bio(struct mapped_device *md, ci.io->orig_bio = b; bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9); bio_chain(b, bio); - generic_make_request(bio); + ret = generic_make_request(bio); break; } } @@ -1546,15 +1562,63 @@ static void __split_and_process_bio(struct mapped_device *md, /* drop the extra reference count */ dec_pending(ci.io, errno_to_blk_status(error)); + return ret; } /* - * The request function that remaps the bio to one target and - * splits off any remainder. + * Optimized variant of __split_and_process_bio that leverages the + * fact that targets that use it do _not_ have a need to split bios. */ -static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t __process_bio(struct mapped_device *md, + struct dm_table *map, struct bio *bio) +{ + struct clone_info ci; + blk_qc_t ret = BLK_QC_T_NONE; + int error = 0; + + if (unlikely(!map)) { + bio_io_error(bio); + return ret; + } + + init_clone_info(&ci, md, map, bio); + + if (bio->bi_opf & REQ_PREFLUSH) { + ci.bio = &ci.io->md->flush_bio; + ci.sector_count = 0; + error = __send_empty_flush(&ci); + /* dec_pending submits any data associated with flush */ + } else { + struct dm_target *ti = md->immutable_target; + struct dm_target_io *tio; + + /* + * Defend against IO still getting in during teardown + * - as was seen for a time with nvme-fcloop + */ + if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) { + error = -EIO; + goto out; + } + + tio = alloc_tio(&ci, ti, 0, GFP_NOIO); + ci.bio = bio; + ci.sector_count = bio_sectors(bio); + ret = __clone_and_map_simple_bio(&ci, tio, NULL); + } +out: + /* drop the extra reference count */ + dec_pending(ci.io, errno_to_blk_status(error)); + return ret; +} + +typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *); + +static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio, + process_bio_fn process_bio) { struct mapped_device *md = q->queuedata; + blk_qc_t ret = BLK_QC_T_NONE; int srcu_idx; struct dm_table *map; @@ -1568,12 +1632,27 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) queue_io(md, bio); else bio_io_error(bio); - return BLK_QC_T_NONE; + return ret; } - __split_and_process_bio(md, map, bio); + ret = process_bio(md, map, bio); + dm_put_live_table(md, srcu_idx); - return BLK_QC_T_NONE; + return ret; +} + +/* + * The request function that remaps the bio to one target and + * splits off any remainder. 
+ */ +static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) +{ + return __dm_make_request(q, bio, __split_and_process_bio); +} + +static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio) +{ + return __dm_make_request(q, bio, __process_bio); } static int dm_any_congested(void *congested_data, int bdi_bits) @@ -1927,6 +2006,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, { struct dm_table *old_map; struct request_queue *q = md->queue; + bool request_based = dm_table_request_based(t); sector_t size; lockdep_assert_held(&md->suspend_lock); @@ -1950,12 +2030,15 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * This must be done before setting the queue restrictions, * because request-based dm may be run just after the setting. */ - if (dm_table_request_based(t)) { + if (request_based) dm_stop_queue(q); + + if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) { /* - * Leverage the fact that request-based DM targets are - * immutable singletons and establish md->immutable_target - * - used to optimize both dm_request_fn and dm_mq_queue_rq + * Leverage the fact that request-based DM targets and + * NVMe bio based targets are immutable singletons + * - used to optimize both dm_request_fn and dm_mq_queue_rq; + * and __process_bio. */ md->immutable_target = dm_table_get_immutable_target(t); } @@ -2073,10 +2156,13 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) break; case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: - case DM_TYPE_NVME_BIO_BASED: dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); break; + case DM_TYPE_NVME_BIO_BASED: + dm_init_normal_md_queue(md); + blk_queue_make_request(md->queue, dm_make_request_nvme); + break; case DM_TYPE_NONE: WARN_ON_ONCE(true); break; From 63f6e6fd05624a1c915ae95dfb81264f659914d8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 5 Dec 2017 14:10:33 -0500 Subject: [PATCH 35/69] dm mpath: remove unused param from multipath_init_per_bio_data() 'struct dm_bio_details *' isn't ever needed. 
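For context: multipath's per-bio-data is sized to hold a struct dm_mpath_io immediately followed by a struct dm_bio_details, so both objects are reachable from the bio alone. A rough model of that layout (the wrapper struct below is hypothetical; upstream simply co-locates the two objects and fetches them with dm_per_bio_data()):

	/*
	 * Hypothetical view of the multipath per-bio-data region, sized by
	 * multipath_per_bio_data_size(); no such struct exists upstream.
	 */
	struct mpath_per_bio_data {
		struct dm_mpath_io mpio;		/* always first */
		struct dm_bio_details bio_details;	/* immediately after */
	};

Because any caller can derive the dm_bio_details pointer on demand, returning it through a 'struct dm_bio_details **' out-parameter was never necessary.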
Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f7810cc869ac..8ed72275026d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -273,8 +273,7 @@ static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio) return bio_details; } -static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p, - struct dm_bio_details **bio_details_p) +static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p) { struct dm_mpath_io *mpio = get_mpio_from_bio(bio); struct dm_bio_details *bio_details = get_bio_details_from_bio(bio); @@ -285,8 +284,6 @@ static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mp if (mpio_p) *mpio_p = mpio; - if (bio_details_p) - *bio_details_p = bio_details; } /*----------------------------------------------- @@ -610,8 +607,7 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio) struct multipath *m = ti->private; struct dm_mpath_io *mpio = NULL; - multipath_init_per_bio_data(bio, &mpio, NULL); - + multipath_init_per_bio_data(bio, &mpio); return __multipath_map_bio(m, bio, mpio); } From d0442f8039ee54716dd3f3100cfd8e11d9a2486c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 11 Dec 2017 15:58:41 -0500 Subject: [PATCH 36/69] dm mpath: remove unnecessary memset() calls for per-io-data All underlying members are initialized directly so the memset() calls are not needed. Also, initialize mpio->nr_bytes from the start since it never changes. Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 8ed72275026d..a3acbfb638be 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -278,12 +278,11 @@ static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mp struct dm_mpath_io *mpio = get_mpio_from_bio(bio); struct dm_bio_details *bio_details = get_bio_details_from_bio(bio); - memset(mpio, 0, sizeof(*mpio)); - memset(bio_details, 0, sizeof(*bio_details)); - dm_bio_record(bio_details, bio); + mpio->nr_bytes = bio->bi_iter.bi_size; + mpio->pgpath = NULL; + *mpio_p = mpio; - if (mpio_p) - *mpio_p = mpio; + dm_bio_record(bio_details, bio); } /*----------------------------------------------- @@ -518,7 +517,6 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_REQUEUE; } - memset(mpio, 0, sizeof(*mpio)); mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; @@ -556,7 +554,6 @@ static void multipath_release_clone(struct request *clone) */ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio) { - size_t nr_bytes = bio->bi_iter.bi_size; struct pgpath *pgpath; unsigned long flags; bool queue_io; @@ -565,7 +562,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m pgpath = READ_ONCE(m->current_pgpath); queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) - pgpath = choose_pgpath(m, nr_bytes); + pgpath = choose_pgpath(m, mpio->nr_bytes); if ((pgpath && queue_io) || (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { @@ -589,7 +586,6 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m } mpio->pgpath = pgpath; - mpio->nr_bytes = nr_bytes; bio->bi_status = 0; bio_set_dev(bio, pgpath->path.dev->bdev); @@ -598,7 +594,7 @@ 
static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, - nr_bytes); + mpio->nr_bytes); return DM_MAPIO_REMAPPED; } From d07a241d4f9aaab61b4a2ce7ec8053ad429c4232 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 11 Dec 2017 16:08:54 -0500 Subject: [PATCH 37/69] dm mpath: optimize retrieval of bio_details from per-bio-data Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index a3acbfb638be..40d721df11d3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -264,19 +264,17 @@ static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio) return dm_per_bio_data(bio, multipath_per_bio_data_size()); } -static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio) +static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio) { /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */ - struct dm_mpath_io *mpio = get_mpio_from_bio(bio); void *bio_details = mpio + 1; - return bio_details; } static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p) { struct dm_mpath_io *mpio = get_mpio_from_bio(bio); - struct dm_bio_details *bio_details = get_bio_details_from_bio(bio); + struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio); mpio->nr_bytes = bio->bi_iter.bi_size; mpio->pgpath = NULL; @@ -1554,7 +1552,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, } /* Queue for the daemon to resubmit */ - dm_bio_restore(get_bio_details_from_bio(clone), clone); + dm_bio_restore(get_bio_details_from_mpio(mpio), clone); spin_lock_irqsave(&m->lock, flags); bio_list_add(&m->queued_bios, clone); From 1836df0891423dcf2b68771e04b28e2208ee95f2 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 6 Dec 2017 16:08:14 -0500 Subject: [PATCH 38/69] dm mpath: move dm_bio_restore out of endio method Moving the dm_bio_restore() to process_queued_bios() avoids doing that work in multipath_end_io_bio(). Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 40d721df11d3..45e9044d3bb5 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -639,7 +639,9 @@ static void process_queued_bios(struct work_struct *work) blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { - r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio)); + struct dm_mpath_io *mpio = get_mpio_from_bio(bio); + dm_bio_restore(get_bio_details_from_mpio(mpio), bio); + r = __multipath_map_bio(m, bio, mpio); switch (r) { case DM_MAPIO_KILL: bio->bi_status = BLK_STS_IOERR; @@ -1551,9 +1553,6 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, goto done; } - /* Queue for the daemon to resubmit */ - dm_bio_restore(get_bio_details_from_mpio(mpio), clone); - spin_lock_irqsave(&m->lock, flags); bio_list_add(&m->queued_bios, clone); spin_unlock_irqrestore(&m->lock, flags); From cd025384455715525a296e54999349e540850301 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 5 Dec 2017 16:02:21 -0500 Subject: [PATCH 39/69] dm mpath: implement NVMe bio-based support This DM multipath NVMe bio-based support requires CONFIG_NVME_MULTIPATH to not be set. In the future hopefully NVMe multipath and DM multipath can co-exist more seamlessly.
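For now the new mode is requested per-table via the existing queue_mode feature, parsed in parse_features() below. A hypothetical example (device names and sizes made up):

	# <#features> <features> <#hw handler args> <#pgs> <next pg> <pg args...>
	dmsetup create mpath_nvme --table \
	  '0 4194304 multipath 2 queue_mode nvme 0 1 1 round-robin 0 2 1 /dev/nvme0n1 1 /dev/nvme1n1 1'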
But as is, if CONFIG_NVME_MULTIPATH=Y then all the individual NVMe paths will remain hidden to upper layers and as such DM multipath will not be able to manage them. NVMe's native multipathing doesn't multipath namespaces across subsystems, though, so technically a user _could_ use CONFIG_NVME_MULTIPATH=Y and also use DM multipath to multipath across subsystems. Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 45e9044d3bb5..d3813b1e74e2 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -221,13 +221,18 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; else m->queue_mode = DM_TYPE_REQUEST_BASED; - } else if (m->queue_mode == DM_TYPE_BIO_BASED) { + + } else if (m->queue_mode == DM_TYPE_BIO_BASED || + m->queue_mode == DM_TYPE_NVME_BIO_BASED) { INIT_WORK(&m->process_queued_bios, process_queued_bios); - /* - * bio-based doesn't support any direct scsi_dh management; - * it just discovers if a scsi_dh is attached. - */ - set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + + if (m->queue_mode == DM_TYPE_BIO_BASED) { + /* + * bio-based doesn't support any direct scsi_dh management; + * it just discovers if a scsi_dh is attached. + */ + set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + } } dm_table_set_type(ti->table, m->queue_mode); @@ -609,7 +614,8 @@ static void process_queued_io_list(struct multipath *m) { if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table)); - else if (m->queue_mode == DM_TYPE_BIO_BASED) + else if (m->queue_mode == DM_TYPE_BIO_BASED || + m->queue_mode == DM_TYPE_NVME_BIO_BASED) queue_work(kmultipathd, &m->process_queued_bios); } @@ -925,7 +931,8 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) if (!hw_argc) return 0; - if (m->queue_mode == DM_TYPE_BIO_BASED) { + if (m->queue_mode == DM_TYPE_BIO_BASED || + m->queue_mode == DM_TYPE_NVME_BIO_BASED) { dm_consume_args(as, hw_argc); DMERR("bio-based multipath doesn't allow hardware handler args"); return 0; @@ -1014,6 +1021,8 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) if (!strcasecmp(queue_mode_name, "bio")) m->queue_mode = DM_TYPE_BIO_BASED; + else if (!strcasecmp(queue_mode_name, "nvme")) + m->queue_mode = DM_TYPE_NVME_BIO_BASED; else if (!strcasecmp(queue_mode_name, "rq")) m->queue_mode = DM_TYPE_REQUEST_BASED; else if (!strcasecmp(queue_mode_name, "mq")) @@ -1114,7 +1123,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_discard_bios = 1; ti->num_write_same_bios = 1; ti->num_write_zeroes_bios = 1; - if (m->queue_mode == DM_TYPE_BIO_BASED) + if (m->queue_mode == DM_TYPE_BIO_BASED || m->queue_mode == DM_TYPE_NVME_BIO_BASED) ti->per_io_data_size = multipath_per_bio_data_size(); else ti->per_io_data_size = sizeof(struct dm_mpath_io); @@ -1660,6 +1669,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type, case DM_TYPE_BIO_BASED: DMEMIT("queue_mode bio "); break; + case DM_TYPE_NVME_BIO_BASED: + DMEMIT("queue_mode nvme "); + break; case DM_TYPE_MQ_REQUEST_BASED: DMEMIT("queue_mode mq "); break; From 848b8aefd44df99b3e38a872acb8d54d3530bebf Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sun, 10 Dec 2017 15:37:21 -0500 Subject: [PATCH 40/69] dm mpath: optimize NVMe bio-based support All code that deals with pg_init is not used
with bio-based NVMe mode. This includes skipping initialization of pg_init related variables. Also, pg_init related members on 'struct multipath' have been grouped together. Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 171 +++++++++++++++++++++++------------------- 1 file changed, 95 insertions(+), 76 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d3813b1e74e2..fa5ee78c69c9 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -64,36 +64,30 @@ struct priority_group { /* Multipath context */ struct multipath { - struct list_head list; - struct dm_target *ti; - - const char *hw_handler_name; - char *hw_handler_params; + unsigned long flags; /* Multipath state flags */ spinlock_t lock; - - unsigned nr_priority_groups; - struct list_head priority_groups; - - wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + enum dm_queue_mode queue_mode; struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ - unsigned long flags; /* Multipath state flags */ + atomic_t nr_valid_paths; /* Total number of usable paths */ + unsigned nr_priority_groups; + struct list_head priority_groups; + const char *hw_handler_name; + char *hw_handler_params; + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ - - atomic_t nr_valid_paths; /* Total number of usable paths */ atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ atomic_t pg_init_count; /* Number of times pg_init called */ - enum dm_queue_mode queue_mode; - struct mutex work_mutex; struct work_struct trigger_event; + struct dm_target *ti; struct work_struct process_queued_bios; struct bio_list queued_bios; @@ -135,10 +129,10 @@ static struct pgpath *alloc_pgpath(void) { struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); - if (pgpath) { - pgpath->is_active = true; - INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work); - } + if (!pgpath) + return NULL; + + pgpath->is_active = true; return pgpath; } @@ -193,13 +187,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti) if (m) { INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); - set_bit(MPATHF_QUEUE_IO, &m->flags); atomic_set(&m->nr_valid_paths, 0); - atomic_set(&m->pg_init_in_progress, 0); - atomic_set(&m->pg_init_count, 0); - m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->trigger_event, trigger_event); - init_waitqueue_head(&m->pg_init_wait); mutex_init(&m->work_mutex); m->queue_mode = DM_TYPE_NONE; @@ -235,6 +224,14 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) } } + if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) { + set_bit(MPATHF_QUEUE_IO, &m->flags); + atomic_set(&m->pg_init_in_progress, 0); + atomic_set(&m->pg_init_count, 0); + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; + init_waitqueue_head(&m->pg_init_wait); + } + dm_table_set_type(ti->table, m->queue_mode); return 0; @@ -339,6 +336,9 @@ static void __switch_pg(struct multipath *m, struct priority_group *pg) { m->current_pg = pg; + if (m->queue_mode == DM_TYPE_NVME_BIO_BASED) + return; + /* Must we initialise the PG first, and queue I/O till it's ready? 
*/ if (m->hw_handler_name) { set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); @@ -384,7 +384,8 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) unsigned bypassed = 1; if (!atomic_read(&m->nr_valid_paths)) { - clear_bit(MPATHF_QUEUE_IO, &m->flags); + if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) + clear_bit(MPATHF_QUEUE_IO, &m->flags); goto failed; } @@ -528,8 +529,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC); if (IS_ERR(clone)) { /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ - bool queue_dying = blk_queue_dying(q); - if (queue_dying) { + if (blk_queue_dying(q)) { atomic_inc(&m->pg_init_in_progress); activate_or_offline_path(pgpath); } @@ -563,21 +563,28 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m /* Do we need to select a new pgpath? */ pgpath = READ_ONCE(m->current_pgpath); + /* MPATHF_QUEUE_IO will never be set for NVMe */ queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) pgpath = choose_pgpath(m, mpio->nr_bytes); - if ((pgpath && queue_io) || - (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { + if ((!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || + (pgpath && queue_io)) { /* Queue for the daemon to resubmit */ spin_lock_irqsave(&m->lock, flags); bio_list_add(&m->queued_bios, bio); spin_unlock_irqrestore(&m->lock, flags); - /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ - if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) - pg_init_all_paths(m); - else if (!queue_io) + + if (m->queue_mode == DM_TYPE_NVME_BIO_BASED) { queue_work(kmultipathd, &m->process_queued_bios); + } else { + /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ + if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) + pg_init_all_paths(m); + else if (!queue_io) + queue_work(kmultipathd, &m->process_queued_bios); + } + return DM_MAPIO_SUBMITTED; } @@ -750,34 +757,11 @@ static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, return 0; } -static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, - struct dm_target *ti) +static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, char **error) { - int r; - struct pgpath *p; - struct multipath *m = ti->private; - struct request_queue *q = NULL; + struct request_queue *q = bdev_get_queue(bdev); const char *attached_handler_name; - - /* we need at least a path arg */ - if (as->argc < 1) { - ti->error = "no device given"; - return ERR_PTR(-EINVAL); - } - - p = alloc_pgpath(); - if (!p) - return ERR_PTR(-ENOMEM); - - r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), - &p->path.dev); - if (r) { - ti->error = "error getting device"; - goto bad; - } - - if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name) - q = bdev_get_queue(p->path.dev->bdev); + int r; if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { retain: @@ -809,26 +793,59 @@ retain: char b[BDEVNAME_SIZE]; printk(KERN_INFO "dm-mpath: retaining handler on device %s\n", - bdevname(p->path.dev->bdev, b)); + bdevname(bdev, b)); goto retain; } if (r < 0) { - ti->error = "error attaching hardware handler"; - dm_put_device(ti, p->path.dev); - goto bad; + *error = "error attaching hardware handler"; + return r; } if (m->hw_handler_params) { r = scsi_dh_set_params(q, m->hw_handler_params); if (r < 0) { - ti->error = "unable to set hardware " - "handler parameters"; - 
dm_put_device(ti, p->path.dev); - goto bad; + *error = "unable to set hardware handler parameters"; + return r; } } } + return 0; +} + +static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, + struct dm_target *ti) +{ + int r; + struct pgpath *p; + struct multipath *m = ti->private; + + /* we need at least a path arg */ + if (as->argc < 1) { + ti->error = "no device given"; + return ERR_PTR(-EINVAL); + } + + p = alloc_pgpath(); + if (!p) + return ERR_PTR(-ENOMEM); + + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &p->path.dev); + if (r) { + ti->error = "error getting device"; + goto bad; + } + + if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) { + INIT_DELAYED_WORK(&p->activate_path, activate_path_work); + r = setup_scsi_dh(p->path.dev->bdev, m, &ti->error); + if (r) { + dm_put_device(ti, p->path.dev); + goto bad; + } + } + r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); if (r) { dm_put_device(ti, p->path.dev); @@ -836,7 +853,6 @@ retain: } return p; - bad: free_pgpath(p); return ERR_PTR(r); @@ -1152,16 +1168,19 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { - set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); - smp_mb__after_atomic(); + if (m->hw_handler_name) { + set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); + smp_mb__after_atomic(); + + flush_workqueue(kmpath_handlerd); + multipath_wait_for_pg_init_completion(m); + + clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); + smp_mb__after_atomic(); + } - flush_workqueue(kmpath_handlerd); - multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); flush_work(&m->trigger_event); - - clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); - smp_mb__after_atomic(); } static void multipath_dtr(struct dm_target *ti) @@ -1537,7 +1556,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, } static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, - blk_status_t *error) + blk_status_t *error) { struct multipath *m = ti->private; struct dm_mpath_io *mpio = get_mpio_from_bio(clone); From 0001ec565db7ed01a6cc9575453900801bd0841b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 11 Dec 2017 11:02:29 -0500 Subject: [PATCH 41/69] dm mpath: factor out SCSI vs NVMe path selection Trying to do both SCSI and NVMe bio-based handling with branching in the same common code has proven too tedious on a code maintenance level. In addition it slightly hurts IO performance. Fix this by factoring out __map_bio() and __map_bio_nvme(). Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 70 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index fa5ee78c69c9..6d1a9906c582 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -555,7 +555,8 @@ static void multipath_release_clone(struct request *clone) /* * Map cloned bios (bio-based multipath) */ -static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio) + +static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; unsigned long flags; @@ -563,31 +564,72 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m /* Do we need to select a new pgpath? 
*/ pgpath = READ_ONCE(m->current_pgpath); - /* MPATHF_QUEUE_IO will never be set for NVMe */ queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) - pgpath = choose_pgpath(m, mpio->nr_bytes); + pgpath = choose_pgpath(m, bio->bi_iter.bi_size); - if ((!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || - (pgpath && queue_io)) { + if ((pgpath && queue_io) || + (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { /* Queue for the daemon to resubmit */ spin_lock_irqsave(&m->lock, flags); bio_list_add(&m->queued_bios, bio); spin_unlock_irqrestore(&m->lock, flags); - if (m->queue_mode == DM_TYPE_NVME_BIO_BASED) { + /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ + if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) + pg_init_all_paths(m); + else if (!queue_io) queue_work(kmultipathd, &m->process_queued_bios); - } else { - /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ - if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) - pg_init_all_paths(m); - else if (!queue_io) - queue_work(kmultipathd, &m->process_queued_bios); - } - return DM_MAPIO_SUBMITTED; + return ERR_PTR(-EAGAIN); } + return pgpath; +} + +static struct pgpath *__map_bio_nvme(struct multipath *m, struct bio *bio) +{ + struct pgpath *pgpath; + unsigned long flags; + + /* Do we need to select a new pgpath? */ + /* + * FIXME: currently only switching path if no path (due to failure, etc) + * - which negates the point of using a path selector + */ + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath) + pgpath = choose_pgpath(m, bio->bi_iter.bi_size); + + if (!pgpath) { + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + /* Queue for the daemon to resubmit */ + spin_lock_irqsave(&m->lock, flags); + bio_list_add(&m->queued_bios, bio); + spin_unlock_irqrestore(&m->lock, flags); + queue_work(kmultipathd, &m->process_queued_bios); + + return ERR_PTR(-EAGAIN); + } + return NULL; + } + + return pgpath; +} + +static int __multipath_map_bio(struct multipath *m, struct bio *bio, + struct dm_mpath_io *mpio) +{ + struct pgpath *pgpath; + + if (m->queue_mode == DM_TYPE_NVME_BIO_BASED) + pgpath = __map_bio_nvme(m, bio); + else + pgpath = __map_bio(m, bio); + + if (IS_ERR(pgpath)) + return DM_MAPIO_SUBMITTED; + + if (!pgpath) { if (must_push_back_bio(m)) return DM_MAPIO_REQUEUE; From 18a5bf270532312178145b80c8893614367de106 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Mon, 18 Dec 2017 10:28:08 -0700 Subject: [PATCH 42/69] dm: add unstriped target This device mapper "unstriped" target remaps and unstripes I/O so it is issued solely on a single drive in a HW RAID0 or dm-striped target. In a 4 drive HW RAID0, each unstriped target exposes 1/4th of the LBA range as a virtual drive. Each I/O to that virtual drive is issued only to the one of the 4 drives in the HW RAID0 that was selected. This unstriped target is most useful for Intel NVMe drives that have multiple cores but that do not have firmware control to pin separate LBA ranges to each discrete cpu core.
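The remap the target performs is simple arithmetic. A sketch of the mapping (plain division standing in for the shift the driver uses, since the initial version requires a power-of-2 chunk size; cf. map_to_core() below):

	/*
	 * Map a sector of the virtual (unstriped) device to a sector of the
	 * underlying striped device: skip over the other drives' chunks in
	 * every preceding "row", then step to the chosen drive's column.
	 */
	static sector_t unstripe_remap(sector_t sector, sector_t stripes,
				       sector_t chunk_size, sector_t unstripe)
	{
		sector_t width = (stripes - 1) * chunk_size;

		sector += width * (sector / chunk_size);	/* skip foreign chunks */
		return sector + unstripe * chunk_size;		/* select the drive */
	}

With stripes=4, chunk_size=256 and unstripe=1, virtual sector 0 maps to sector 256 and virtual sector 256 maps to sector 1280 of the striped device.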
Signed-off-by: Scott Bauer Signed-off-by: Heinz Mauelshagen Acked-by: Keith Busch Signed-off-by: Mike Snitzer --- Documentation/device-mapper/unstriped.txt | 124 ++++++++++++ drivers/md/Kconfig | 7 + drivers/md/Makefile | 1 + drivers/md/dm-unstripe.c | 225 ++++++++++++++++++++++ 4 files changed, 357 insertions(+) create mode 100644 Documentation/device-mapper/unstriped.txt create mode 100644 drivers/md/dm-unstripe.c diff --git a/Documentation/device-mapper/unstriped.txt b/Documentation/device-mapper/unstriped.txt new file mode 100644 index 000000000000..0b2a306c54ee --- /dev/null +++ b/Documentation/device-mapper/unstriped.txt @@ -0,0 +1,124 @@ +Introduction +============ + +The device-mapper "unstriped" target provides a transparent mechanism to +unstripe a device-mapper "striped" target to access the underlying disks +without having to touch the true backing block-device. It can also be +used to unstripe a hardware RAID-0 to access backing disks. + +Parameters: +<number of stripes> <chunk size> <stripe #> <dev_path> <offset> + +<number of stripes> + The number of stripes in the RAID 0. + +<chunk size> + The number of 512B sectors in the chunk striping. + +<dev_path> + The block device you wish to unstripe. + +<stripe #> + The stripe number within the device that corresponds to the physical + drive you wish to unstripe. This must be 0 indexed. + +Why use this module? +==================== + +An example of undoing an existing dm-stripe +------------------------------------------- + +This small bash script will set up 4 loop devices and use the existing +striped target to combine the 4 devices into one. It then will use +the unstriped target on top of the striped device to access the +individual backing loop devices. We write data to the newly exposed +unstriped devices and verify the data written matches the correct +underlying device on the striped array. + +#!/bin/bash + +MEMBER_SIZE=$((128 * 1024 * 1024)) +NUM=4 +SEQ_END=$((${NUM}-1)) +CHUNK=256 +BS=4096 + +RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512)) +DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}" +COUNT=$((${MEMBER_SIZE} / ${BS})) + +for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct + losetup /dev/loop${i} member-${i} + DM_PARMS+=" /dev/loop${i} 0" +done + +echo $DM_PARMS | dmsetup create raid0 +for i in $(seq 0 ${SEQ_END}); do + echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i} +done; + +for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct + diff /dev/mapper/set-${i} member-${i} +done; + +for i in $(seq 0 ${SEQ_END}); do + dmsetup remove set-${i} +done + +dmsetup remove raid0 + +for i in $(seq 0 ${SEQ_END}); do + losetup -d /dev/loop${i} + rm -f member-${i} +done + +Another example +--------------- + +Intel NVMe drives contain two cores on the physical device. +Each core of the drive has segregated access to its LBA range. +The current LBA model has a RAID 0 128k chunk on each core, resulting +in a 256k stripe across the two cores: + + Core 0: Core 1: + __________ __________ + | LBA 512| | LBA 768| + | LBA 0 | | LBA 256| + ---------- ---------- + +The purpose of this unstriping is to provide better QoS in noisy +neighbor environments. When two partitions are created on the +aggregate drive without this unstriping, reads on one partition +can affect writes on another partition. This is because the partitions +are striped across the two cores. When we unstripe this hardware RAID 0 +and make partitions on each newly exposed device, the two partitions are now +physically separated.
+ +With the dm-unstriped target we're able to segregate an fio script that +has read and write jobs that are independent of each other. Compared to +when we run the test on a combined drive with partitions, we were able +to get a 92% reduction in read latency using this device mapper target. + + +Example dmsetup usage +===================== + +unstriped on top of Intel NVMe device that has 2 cores +----------------------------------------------------- +dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0' +dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0' + +There will now be two devices that expose Intel NVMe core 0 and 1 +respectively: +/dev/mapper/nvmset0 +/dev/mapper/nvmset1 + +unstriped on top of striped with 4 drives using 128K chunk size +-------------------------------------------------------------- +dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0' +dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0' +dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0' +dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0' diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 83b9362be09c..2c8ac3688815 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -269,6 +269,13 @@ config DM_BIO_PRISON source "drivers/md/persistent-data/Kconfig" +config DM_UNSTRIPED + tristate "Unstriped target" + depends on BLK_DEV_DM + ---help--- + Unstripes I/O so it is issued solely on a single drive in a HW + RAID0 or dm-striped target. + config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f701bb211783..63255f3ebd97 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o +obj-$(CONFIG_DM_UNSTRIPED) += dm-unstripe.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c new file mode 100644 index 000000000000..061b4f10bf5c --- /dev/null +++ b/drivers/md/dm-unstripe.c @@ -0,0 +1,225 @@ +/* + * Copyright (C) 2017 Intel Corporation. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/log2.h> +#include <linux/bio.h> +#include <linux/device-mapper.h> + +struct unstripe_c { + struct dm_dev *dev; + sector_t physical_start; + + uint32_t stripes; + + uint32_t unstripe; + sector_t unstripe_width; + sector_t unstripe_offset; + + uint32_t chunk_size; + u8 chunk_shift; +}; + +#define DM_MSG_PREFIX "unstriped" + +static void cleanup_unstripe(struct unstripe_c *uc, struct dm_target *ti) +{ + if (uc->dev) + dm_put_device(ti, uc->dev); + kfree(uc); +} + +/* + * Construct an unstriped mapping.
+ * + */ +static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct unstripe_c *uc; + sector_t width, tmp_len; + unsigned long long start; + char dummy; + + if (argc != 5) { + ti->error = "Invalid number of arguments"; + return -EINVAL; + } + + uc = kzalloc(sizeof(*uc), GFP_KERNEL); + if (!uc) { + ti->error = "Memory allocation for unstriped context failed"; + return -ENOMEM; + } + + if (kstrtouint(argv[0], 10, &uc->stripes) || !uc->stripes) { + ti->error = "Invalid stripe count"; + goto err; + } + + if (kstrtouint(argv[1], 10, &uc->chunk_size) || !uc->chunk_size) { + ti->error = "Invalid chunk_size"; + goto err; + } + + // FIXME: must support non power of 2 chunk_size, dm-stripe.c does + if (!is_power_of_2(uc->chunk_size)) { + ti->error = "Non power of 2 chunk_size is not supported yet"; + goto err; + } + + if (kstrtouint(argv[2], 10, &uc->unstripe)) { + ti->error = "Invalid stripe number"; + goto err; + } + + if (uc->unstripe > uc->stripes && uc->stripes > 1) { + ti->error = "Please provide stripe between [0, # of stripes]"; + goto err; + } + + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &uc->dev)) { + ti->error = "Couldn't get striped device"; + goto err; + } + + if (sscanf(argv[4], "%llu%c", &start, &dummy) != 1) { + ti->error = "Invalid striped device offset"; + goto err; + } + uc->physical_start = start; + + uc->unstripe_offset = uc->unstripe * uc->chunk_size; + uc->unstripe_width = (uc->stripes - 1) * uc->chunk_size; + uc->chunk_shift = fls(uc->chunk_size) - 1; + + width = ti->len; + if (sector_div(width, uc->stripes)) { + ti->error = "Target length not divisible by number of stripes"; + goto err; + } + + tmp_len = width; + if (sector_div(tmp_len, uc->chunk_size)) { + ti->error = "Target length not divisible by chunk size"; + goto err; + } + + if (dm_set_target_max_io_len(ti, uc->chunk_size)) { + ti->error = "Failed to set max io len"; + goto err; + } + + ti->private = uc; + return 0; +err: + cleanup_unstripe(uc, ti); + return -EINVAL; +} + +static void unstripe_dtr(struct dm_target *ti) +{ + struct unstripe_c *uc = ti->private; + + cleanup_unstripe(uc, ti); +} + +static sector_t map_to_core(struct dm_target *ti, struct bio *bio) +{ + struct unstripe_c *uc = ti->private; + sector_t sector = bio->bi_iter.bi_sector; + + /* Shift us up to the right "row" on the stripe */ + sector += uc->unstripe_width * (sector >> uc->chunk_shift); + + /* Account for what stripe we're operating on */ + sector += uc->unstripe_offset; + + return sector; +} + +static int unstripe_map(struct dm_target *ti, struct bio *bio) +{ + struct unstripe_c *uc = ti->private; + + bio_set_dev(bio, uc->dev->bdev); + bio->bi_iter.bi_sector = map_to_core(ti, bio) + uc->physical_start; + + return DM_MAPIO_REMAPPED; +} + +static void unstripe_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct unstripe_c *uc = ti->private; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + break; + + case STATUSTYPE_TABLE: + DMEMIT("%d %llu %d %s %llu", + uc->stripes, (unsigned long long)uc->chunk_size, uc->unstripe, + uc->dev->name, (unsigned long long)uc->physical_start); + break; + } +} + +static int unstripe_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct unstripe_c *uc = ti->private; + + return fn(ti, uc->dev, uc->physical_start, ti->len, data); +} + +static void unstripe_io_hints(struct dm_target *ti, + struct queue_limits *limits) +{ + struct 
unstripe_c *uc = ti->private; + + limits->chunk_sectors = uc->chunk_size; +} + +static struct target_type unstripe_target = { + .name = "unstriped", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = unstripe_ctr, + .dtr = unstripe_dtr, + .map = unstripe_map, + .status = unstripe_status, + .iterate_devices = unstripe_iterate_devices, + .io_hints = unstripe_io_hints, +}; + +static int __init dm_unstripe_init(void) +{ + int r; + + r = dm_register_target(&unstripe_target); + if (r < 0) + DMERR("target registration failed"); + + return r; +} + +static void __exit dm_unstripe_exit(void) +{ + dm_unregister_target(&unstripe_target); +} + +module_init(dm_unstripe_init); +module_exit(dm_unstripe_exit); + +MODULE_DESCRIPTION(DM_NAME " unstriped target"); +MODULE_AUTHOR("Scott Bauer "); +MODULE_LICENSE("GPL"); From 905be0a121d931132e081784930fc7d7c8d58071 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sat, 2 Dec 2017 00:33:39 -0500 Subject: [PATCH 43/69] dm bufio: use REQ_OP_READ and REQ_OP_WRITE Use REQ_OP_READ and REQ_OP_WRITE macros instead of READ and WRITE. They have the same value, but the block layer uses REQ_OP so bufio should too. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index c546b567f3b5..951a6356fbec 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -662,7 +662,7 @@ static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) sector = (b->block << b->c->sectors_per_block_bits) + b->c->start; - if (rw != WRITE) { + if (rw != REQ_OP_WRITE) { n_sectors = 1 << b->c->sectors_per_block_bits; offset = 0; } else { @@ -740,7 +740,7 @@ static void __write_dirty_buffer(struct dm_buffer *b, b->write_end = b->dirty_end; if (!write_list) - submit_io(b, WRITE, write_endio); + submit_io(b, REQ_OP_WRITE, write_endio); else list_add_tail(&b->write_list, write_list); } @@ -753,7 +753,7 @@ static void __flush_write_list(struct list_head *write_list) struct dm_buffer *b = list_entry(write_list->next, struct dm_buffer, write_list); list_del(&b->write_list); - submit_io(b, WRITE, write_endio); + submit_io(b, REQ_OP_WRITE, write_endio); cond_resched(); } blk_finish_plug(&plug); @@ -1123,7 +1123,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, return NULL; if (need_submit) - submit_io(b, READ, read_endio); + submit_io(b, REQ_OP_READ, read_endio); wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); @@ -1193,7 +1193,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, dm_bufio_unlock(c); if (need_submit) - submit_io(b, READ, read_endio); + submit_io(b, REQ_OP_READ, read_endio); dm_bufio_release(b); cond_resched(); @@ -1454,7 +1454,7 @@ retry: old_block = b->block; __unlink_buffer(b); __link_buffer(b, new_block, b->list_mode); - submit_io(b, WRITE, write_endio); + submit_io(b, REQ_OP_WRITE, write_endio); wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); __unlink_buffer(b); From bde14184781bd24ee6fb0e1af8d69ca21acbd6e6 Mon Sep 17 00:00:00 2001 From: Aliaksei Karaliou Date: Sat, 23 Dec 2017 13:27:03 +0300 Subject: [PATCH 44/69] dm bufio: add missed destroys of client mutex The client's mutex needs to be destroyed in dm_bufio_client_destroy() as well as the dm_bufio_client_create() error path. 
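The rule being applied: every mutex_init() needs a matching mutex_destroy(), both on the normal teardown path and on any constructor error path, so that mutex debugging (CONFIG_DEBUG_MUTEXES) can catch leaked or still-locked mutexes. A minimal sketch of the pattern (hypothetical helpers, not the bufio code itself):

	struct client {
		struct mutex lock;
	};

	struct client *client_create(void)
	{
		struct client *c = kzalloc(sizeof(*c), GFP_KERNEL);

		if (!c)
			return ERR_PTR(-ENOMEM);
		mutex_init(&c->lock);

		if (acquire_resources(c)) {		/* hypothetical helper */
			mutex_destroy(&c->lock);	/* unwind the init on error */
			kfree(c);
			return ERR_PTR(-EIO);
		}
		return c;
	}

	void client_destroy(struct client *c)
	{
		release_resources(c);			/* hypothetical helper */
		mutex_destroy(&c->lock);
		kfree(c);
	}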
Signed-off-by: Aliaksei Karaliou Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 951a6356fbec..1d130130f264 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1767,6 +1767,7 @@ bad_cache: } dm_io_client_destroy(c->dm_io); bad_dm_io: + mutex_destroy(&c->lock); kfree(c); bad_client: return ERR_PTR(r); @@ -1811,6 +1812,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) BUG_ON(c->n_buffers[i]); dm_io_client_destroy(c->dm_io); + mutex_destroy(&c->lock); kfree(c); } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); From 46898e9a7ac9fb2d0d108cc7b753abccbf9d9f20 Mon Sep 17 00:00:00 2001 From: Aliaksei Karaliou Date: Sat, 23 Dec 2017 13:27:04 +0300 Subject: [PATCH 45/69] dm bufio: check result of register_shrinker() dm_bufio_client_create() does not check result of register_shrinker() which was tagged as __must_check recently, reported by sparse. Signed-off-by: Aliaksei Karaliou Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 1d130130f264..b98c72fa7322 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1743,20 +1743,23 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign __free_buffer_wake(b); } + c->shrinker.count_objects = dm_bufio_shrink_count; + c->shrinker.scan_objects = dm_bufio_shrink_scan; + c->shrinker.seeks = 1; + c->shrinker.batch = 0; + r = register_shrinker(&c->shrinker); + if (r) + goto bad_shrinker; + mutex_lock(&dm_bufio_clients_lock); dm_bufio_client_count++; list_add(&c->client_list, &dm_bufio_all_clients); __cache_size_refresh(); mutex_unlock(&dm_bufio_clients_lock); - c->shrinker.count_objects = dm_bufio_shrink_count; - c->shrinker.scan_objects = dm_bufio_shrink_scan; - c->shrinker.seeks = 1; - c->shrinker.batch = 0; - register_shrinker(&c->shrinker); - return c; +bad_shrinker: bad_buffer: bad_cache: while (!list_empty(&c->reserved_buffers)) { From 0e696d385d4be83055d8281f7e089a1b9ea18a1a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 4 Jan 2018 12:14:57 -0500 Subject: [PATCH 46/69] dm bufio: eliminate unnecessary labels in dm_bufio_client_create() Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index b98c72fa7322..414c9af54ded 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1716,7 +1716,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign if (!DM_BUFIO_CACHE_NAME(c)) { r = -ENOMEM; mutex_unlock(&dm_bufio_clients_lock); - goto bad_cache; + goto bad; } } @@ -1727,7 +1727,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign if (!DM_BUFIO_CACHE(c)) { r = -ENOMEM; mutex_unlock(&dm_bufio_clients_lock); - goto bad_cache; + goto bad; } } } @@ -1738,7 +1738,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign if (!b) { r = -ENOMEM; - goto bad_buffer; + goto bad; } __free_buffer_wake(b); } @@ -1749,7 +1749,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->shrinker.batch = 0; r = register_shrinker(&c->shrinker); if (r) - goto bad_shrinker; + goto bad; mutex_lock(&dm_bufio_clients_lock); dm_bufio_client_count++; @@ -1759,9 +1759,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct 
block_device *bdev, unsign return c; -bad_shrinker: -bad_buffer: -bad_cache: +bad: while (!list_empty(&c->reserved_buffers)) { struct dm_buffer *b = list_entry(c->reserved_buffers.next, struct dm_buffer, lru_list); From 67ac901c553bab4bcc05ed1253829bf462c26b1f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 2 Jan 2018 06:18:10 +0000 Subject: [PATCH 47/69] dm raid: make raid_sets symbol static Fixes the following sparse warning: drivers/md/dm-raid.c:33:1: warning: symbol 'raid_sets' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 7d7dc1723180..d46d1945fbcc 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -30,7 +30,7 @@ #define MIN_RAID456_JOURNAL_SPACE (4*2048) /* Global list of all raid sets */ -LIST_HEAD(raid_sets); +static LIST_HEAD(raid_sets); static bool devices_handle_discard_safely = false; From f6e7baadd96bd746d3fb584959d3f152189d05e1 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 28 Mar 2017 11:31:02 -0700 Subject: [PATCH 48/69] dm: move dm_table_destroy() to same header as dm_table_create() If anyone is going to use dm_table_create(), they probably should be able to use dm_table_destroy() too. Move the dm_table_destroy() definition outside the private header, near dm_table_create() Signed-off-by: Brian Norris Signed-off-by: Mike Snitzer --- drivers/md/dm.h | 1 - include/linux/device-mapper.h | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 7c66c316add3..114a81b27c37 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -49,7 +49,6 @@ struct dm_md_mempools; /*----------------------------------------------------------------- * Internal table functions. *---------------------------------------------------------------*/ -void dm_table_destroy(struct dm_table *t); void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 41ec228b02a6..9ba84532947d 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -459,6 +459,11 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type); */ int dm_table_complete(struct dm_table *t); +/* + * Destroy the table when finished. + */ +void dm_table_destroy(struct dm_table *t); + /* * Target may require that it is never sent I/O larger than len. */ From 424da29c5a85adfcc1eda9412516efd77355d469 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 2 Dec 2015 12:32:49 -0500 Subject: [PATCH 49/69] dm snapshot: improve documentation relative to origin suspend requirements Add a note to snapshot.txt that the origin target must be suspended when loading or unloading the snapshot target. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- Documentation/device-mapper/snapshot.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt index ad6949bff2e3..b8bbb516f989 100644 --- a/Documentation/device-mapper/snapshot.txt +++ b/Documentation/device-mapper/snapshot.txt @@ -49,6 +49,10 @@ The difference between persistent and transient is with transient snapshots less metadata must be saved on disk - they can be kept in memory by the kernel. 
+When loading or unloading the snapshot target, the corresponding +snapshot-origin or snapshot-merge target must be suspended. A failure to +suspend the origin target could result in data corruption. + * snapshot-merge From 677210462daf48dbf69a5bd2e38deb271d1a41bf Mon Sep 17 00:00:00 2001 From: mulhern Date: Mon, 27 Nov 2017 10:02:40 -0500 Subject: [PATCH 50/69] dm cache: fix grammar in cache-policies.txt Use possessive pronoun where appropriate, instead of contraction. Signed-off-by: mulhern Signed-off-by: Mike Snitzer --- Documentation/device-mapper/cache-policies.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt index d3ca8af21a31..86786d87d9a8 100644 --- a/Documentation/device-mapper/cache-policies.txt +++ b/Documentation/device-mapper/cache-policies.txt @@ -60,7 +60,7 @@ Memory usage: The mq policy used a lot of memory; 88 bytes per cache block on a 64 bit machine. -smq uses 28bit indexes to implement it's data structures rather than +smq uses 28bit indexes to implement its data structures rather than pointers. It avoids storing an explicit hit count for each block. It has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of the entries (each hotspot block covers a larger area than a single @@ -84,7 +84,7 @@ resulting in better promotion/demotion decisions. Adaptability: The mq policy maintained a hit count for each cache block. For a -different block to get promoted to the cache it's hit count has to +different block to get promoted to the cache its hit count has to exceed the lowest currently in the cache. This meant it could take a long time for the cache to adapt between varying IO patterns. From 3716e20af5b583c3e15661aab657168176baa01e Mon Sep 17 00:00:00 2001 From: mulhern Date: Mon, 27 Nov 2017 10:02:41 -0500 Subject: [PATCH 51/69] dm cache: delete obsoleted paragraph in cache.txt The 'mq' policy is no longer the default policy, and the default policy, 'smq', does not store hit counts. Signed-off-by: mulhern Signed-off-by: Mike Snitzer --- Documentation/device-mapper/cache.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index cdfd0feb294e..043524409221 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -143,11 +143,6 @@ the policy how big this chunk is, but it should be kept small. Like the dirty flags this data is lost if there's a crash so a safe fallback value should always be possible. -For instance, the 'mq' policy, which is currently the default policy, -uses this facility to store the hit count of the cache blocks. If -there's a crash this information will be lost, which means the cache -may be less efficient until those hit counts are regenerated. - Policy hints affect performance, not correctness. 
Policy messaging From 1346638e5f80588dfafbf07315f72b84c2ab1113 Mon Sep 17 00:00:00 2001 From: mulhern Date: Mon, 27 Nov 2017 10:02:42 -0500 Subject: [PATCH 52/69] dm cache: be consistent in specifying sectors and SI units in cache.txt Signed-off-by: mulhern Signed-off-by: Mike Snitzer --- Documentation/device-mapper/cache.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index 043524409221..79c7b6dc88ae 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -59,7 +59,7 @@ Fixed block size The origin is divided up into blocks of a fixed size. This block size is configurable when you first create the cache. Typically we've been using block sizes of 256KB - 1024KB. The block size must be between 64 -(32KB) and 2097152 (1GB) and a multiple of 64 (32KB). +sectors (32KB) and 2097152 sectors (1GB) and a multiple of 64 sectors (32KB). Having a fixed block size simplifies the target a lot. But it is something of a compromise. For instance, a small part of a block may be @@ -119,7 +119,7 @@ doing here to avoid migrating during those peak io moments. For the time being, a message "migration_threshold <#sectors>" can be used to set the maximum number of sectors being migrated, -the default being 204800 sectors (or 100MB). +the default being 204800 sectors (100MB). Updating on-disk metadata ------------------------- From 9b28a1102efc75d81298198166ead87d643a29ce Mon Sep 17 00:00:00 2001 From: mulhern Date: Mon, 27 Nov 2017 10:02:39 -0500 Subject: [PATCH 53/69] dm thin: fix documentation relative to low water mark threshold Fixes: 1. The use of "exceeds" when the opposite of exceeds, falls below, was meant. 2. Properly speaking, a table can not exceed a threshold. It emphasizes the important point, which is that it is the userspace daemon's responsibility to check for low free space when a device is resumed, since it won't get a special event indicating low free space in that situation. Signed-off-by: mulhern Signed-off-by: Mike Snitzer --- Documentation/device-mapper/thin-provisioning.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt index 1699a55b7b70..ef639960b272 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -112,9 +112,11 @@ $low_water_mark is expressed in blocks of size $data_block_size. If free space on the data device drops below this level then a dm event will be triggered which a userspace daemon should catch allowing it to extend the pool device. Only one such event will be sent. -Resuming a device with a new table itself triggers an event so the -userspace daemon can use this to detect a situation where a new table -already exceeds the threshold. + +No special event is triggered if a just resumed device's free space is below +the low water mark. However, resuming a device always triggers an +event; a userspace daemon should verify that free space exceeds the low +water mark when handling this event. 
A low water mark for the metadata device is maintained in the kernel and will trigger a dm event if free space on the metadata device drops below From 2bc8a61c696e41eef27ab7ff3bcfe75d2daf573a Mon Sep 17 00:00:00 2001 From: mulhern Date: Mon, 27 Nov 2017 10:02:43 -0500 Subject: [PATCH 54/69] dm thin: document representation of <highest mapped sector> when there is none Signed-off-by: mulhern Signed-off-by: Mike Snitzer --- Documentation/device-mapper/thin-provisioning.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt index ef639960b272..3852dade03e4 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -396,3 +396,6 @@ ii) Status If the pool has encountered device errors and failed, the status will just contain the string 'Fail'. The userspace recovery tools should then be used. + + In the case where <nr mapped sectors> is 0, there is no highest + mapped sector and the value of <highest mapped sector> is unspecified. From cc3ff0af19499f0aaa587f42d3091b4dff49208f Mon Sep 17 00:00:00 2001 From: mulhern Date: Mon, 27 Nov 2017 10:02:44 -0500 Subject: [PATCH 55/69] dm thin: fixes in thin-provisioning.txt Make the format string for thinpool status more correct. Swap the order of two items to correspond with reality. Signed-off-by: mulhern Signed-off-by: Mike Snitzer --- Documentation/device-mapper/thin-provisioning.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt index 3852dade03e4..fb10d187090e 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -276,7 +276,7 @@ ii) Status <transaction id> <used metadata blocks>/<total metadata blocks> <used data blocks>/<total data blocks> <held metadata root> - [no_]discard_passdown ro|rw + ro|rw|out_of_data_space [no_]discard_passdown transaction id: A 64-bit number used by userspace to help synchronise with metadata From 7efd5fed6fd713e206ace4b71116d0a5d719fb0a Mon Sep 17 00:00:00 2001 From: mulhern Date: Mon, 27 Nov 2017 10:02:45 -0500 Subject: [PATCH 56/69] dm thin: extend thinpool status format string with omitted fields Signed-off-by: mulhern Signed-off-by: Mike Snitzer --- Documentation/device-mapper/thin-provisioning.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt index fb10d187090e..4bcd4b7f79f9 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -276,7 +276,8 @@ ii) Status <transaction id> <used metadata blocks>/<total metadata blocks> <used data blocks>/<total data blocks> <held metadata root> - ro|rw|out_of_data_space [no_]discard_passdown + ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space + needs_check|- transaction id: A 64-bit number used by userspace to help synchronise with metadata From 7690e25302dc7d0cd42b349e746fe44b44a94f2b Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Sun, 3 Dec 2017 21:14:12 -0600 Subject: [PATCH 57/69] dm flakey: check for null arg_name in parse_features() One can crash dm-flakey by specifying more feature arguments than the number of features supplied. Checking for null in arg_name avoids this.
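The crash is possible because the feature-parsing loop trusts the declared count: dm_shift_arg() returns NULL once the supplied arguments run out, and the subsequent strcasecmp() calls then dereference NULL. Roughly what dm_shift_arg() does (cf. include/linux/device-mapper.h):

	static inline char *dm_shift_arg(struct dm_arg_set *as)
	{
		char *r = NULL;

		if (as->argc) {
			as->argc--;
			r = *as->argv;
			as->argv++;
		}
		return r;	/* NULL once the supplied args are exhausted */
	}

The reproducer below declares 2 feature arguments but supplies only drop_writes: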
dmsetup create flakey-test --table "0 66076080 flakey /dev/sdb9 0 0 180 2 drop_writes" Signed-off-by: Goldwyn Rodrigues Signed-off-by: Mike Snitzer --- drivers/md/dm-flakey.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index b82cb1ab1eaa..1b907b15f5c3 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -70,6 +70,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, arg_name = dm_shift_arg(as); argc--; + if (!arg_name) { + ti->error = "Insufficient feature arguments"; + return -EINVAL; + } + /* * drop_writes */ From ae1093be5a0ef997833e200a0dafb9ed0b1ff4fe Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 23 Nov 2017 16:15:43 -0500 Subject: [PATCH 58/69] dm snapshot: use mutex instead of rw_semaphore The rw_semaphore is acquired for read only in two places, neither is performance-critical. So replace it with a mutex -- which is more efficient. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 84 +++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a0613bd8ed00..216035be5661 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -47,7 +47,7 @@ struct dm_exception_table { }; struct dm_snapshot { - struct rw_semaphore lock; + struct mutex lock; struct dm_dev *origin; struct dm_dev *cow; @@ -439,9 +439,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) continue; - down_read(&s->lock); + mutex_lock(&s->lock); active = s->active; - up_read(&s->lock); + mutex_unlock(&s->lock); if (active) { if (snap_src) @@ -909,7 +909,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s) int r; chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; - down_write(&s->lock); + mutex_lock(&s->lock); /* * Process chunks (and associated exceptions) in reverse order @@ -924,7 +924,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s) b = __release_queued_bios_after_merge(s); out: - up_write(&s->lock); + mutex_unlock(&s->lock); if (b) flush_bios(b); @@ -983,9 +983,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) if (linear_chunks < 0) { DMERR("Read error in exception store: " "shutting down merge"); - down_write(&s->lock); + mutex_lock(&s->lock); s->merge_failed = 1; - up_write(&s->lock); + mutex_unlock(&s->lock); } goto shut; } @@ -1026,10 +1026,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) previous_count = read_pending_exceptions_done_count(); } - down_write(&s->lock); + mutex_lock(&s->lock); s->first_merging_chunk = old_chunk; s->num_merging_chunks = linear_chunks; - up_write(&s->lock); + mutex_unlock(&s->lock); /* Wait until writes to all 'linear_chunks' drain */ for (i = 0; i < linear_chunks; i++) @@ -1071,10 +1071,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) return; shut: - down_write(&s->lock); + mutex_lock(&s->lock); s->merge_failed = 1; b = __release_queued_bios_after_merge(s); - up_write(&s->lock); + mutex_unlock(&s->lock); error_bios(b); merge_shutdown(s); @@ -1173,7 +1173,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->exception_start_sequence = 0; s->exception_complete_sequence = 0; INIT_LIST_HEAD(&s->out_of_order_list); - init_rwsem(&s->lock); + mutex_init(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); 
s->state_bits = 0; @@ -1338,9 +1338,9 @@ static void snapshot_dtr(struct dm_target *ti) /* Check whether exception handover must be cancelled */ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest && (s == snap_src)) { - down_write(&snap_dest->lock); + mutex_lock(&snap_dest->lock); snap_dest->valid = 0; - up_write(&snap_dest->lock); + mutex_unlock(&snap_dest->lock); DMERR("Cancelling snapshot handover."); } up_read(&_origins_lock); @@ -1371,6 +1371,8 @@ static void snapshot_dtr(struct dm_target *ti) dm_exception_store_destroy(s->store); + mutex_destroy(&s->lock); + dm_put_device(ti, s->cow); dm_put_device(ti, s->origin); @@ -1458,7 +1460,7 @@ static void pending_complete(void *context, int success) if (!success) { /* Read/write error - snapshot is unusable */ - down_write(&s->lock); + mutex_lock(&s->lock); __invalidate_snapshot(s, -EIO); error = 1; goto out; @@ -1466,14 +1468,14 @@ static void pending_complete(void *context, int success) e = alloc_completed_exception(GFP_NOIO); if (!e) { - down_write(&s->lock); + mutex_lock(&s->lock); __invalidate_snapshot(s, -ENOMEM); error = 1; goto out; } *e = pe->e; - down_write(&s->lock); + mutex_lock(&s->lock); if (!s->valid) { free_completed_exception(e); error = 1; @@ -1498,7 +1500,7 @@ out: full_bio->bi_end_io = pe->full_bio_end_io; increment_pending_exceptions_done_count(); - up_write(&s->lock); + mutex_unlock(&s->lock); /* Submit any pending write bios */ if (error) { @@ -1694,7 +1696,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) /* FIXME: should only take write lock if we need * to copy an exception */ - down_write(&s->lock); + mutex_lock(&s->lock); if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_data_dir(bio) == WRITE)) { @@ -1717,9 +1719,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == WRITE) { pe = __lookup_pending_exception(s, chunk); if (!pe) { - up_write(&s->lock); + mutex_unlock(&s->lock); pe = alloc_pending_exception(s); - down_write(&s->lock); + mutex_lock(&s->lock); if (!s->valid || s->snapshot_overflowed) { free_pending_exception(pe); @@ -1754,7 +1756,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) bio->bi_iter.bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { pe->started = 1; - up_write(&s->lock); + mutex_unlock(&s->lock); start_full_bio(pe, bio); goto out; } @@ -1764,7 +1766,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; - up_write(&s->lock); + mutex_unlock(&s->lock); start_copy(pe); goto out; } @@ -1774,7 +1776,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) } out_unlock: - up_write(&s->lock); + mutex_unlock(&s->lock); out: return r; } @@ -1810,7 +1812,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); - down_write(&s->lock); + mutex_lock(&s->lock); /* Full merging snapshots are redirected to the origin */ if (!s->valid) @@ -1841,12 +1843,12 @@ redirect_to_origin: bio_set_dev(bio, s->origin->bdev); if (bio_data_dir(bio) == WRITE) { - up_write(&s->lock); + mutex_unlock(&s->lock); return do_origin(s->origin, bio); } out_unlock: - up_write(&s->lock); + mutex_unlock(&s->lock); return r; } @@ -1878,7 +1880,7 @@ static int snapshot_preresume(struct dm_target *ti) down_read(&_origins_lock); (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { - 
down_read(&snap_src->lock); + mutex_lock(&snap_src->lock); if (s == snap_src) { DMERR("Unable to resume snapshot source until " "handover completes."); @@ -1888,7 +1890,7 @@ static int snapshot_preresume(struct dm_target *ti) "source is suspended."); r = -EINVAL; } - up_read(&snap_src->lock); + mutex_unlock(&snap_src->lock); } up_read(&_origins_lock); @@ -1934,11 +1936,11 @@ static void snapshot_resume(struct dm_target *ti) (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { - down_write(&snap_src->lock); - down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); + mutex_lock(&snap_src->lock); + mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); __handover_exceptions(snap_src, snap_dest); - up_write(&snap_dest->lock); - up_write(&snap_src->lock); + mutex_unlock(&snap_dest->lock); + mutex_unlock(&snap_src->lock); } up_read(&_origins_lock); @@ -1953,9 +1955,9 @@ static void snapshot_resume(struct dm_target *ti) /* Now we have correct chunk size, reregister */ reregister_snapshot(s); - down_write(&s->lock); + mutex_lock(&s->lock); s->active = 1; - up_write(&s->lock); + mutex_unlock(&s->lock); } static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) @@ -1995,7 +1997,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - down_write(&snap->lock); + mutex_lock(&snap->lock); if (!snap->valid) DMEMIT("Invalid"); @@ -2020,7 +2022,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, DMEMIT("Unknown"); } - up_write(&snap->lock); + mutex_unlock(&snap->lock); break; @@ -2086,7 +2088,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, if (dm_target_is_snapshot_merge(snap->ti)) continue; - down_write(&snap->lock); + mutex_lock(&snap->lock); /* Only deal with valid and active snapshots */ if (!snap->valid || !snap->active) @@ -2113,9 +2115,9 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, pe = __lookup_pending_exception(snap, chunk); if (!pe) { - up_write(&snap->lock); + mutex_unlock(&snap->lock); pe = alloc_pending_exception(snap); - down_write(&snap->lock); + mutex_lock(&snap->lock); if (!snap->valid) { free_pending_exception(pe); @@ -2158,7 +2160,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, } next_snapshot: - up_write(&snap->lock); + mutex_unlock(&snap->lock); if (pe_to_start_now) { start_copy(pe_to_start_now); From d5ffebdd797a7c1c89576267640f671db2a668fc Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 5 Jan 2018 21:17:20 -0500 Subject: [PATCH 59/69] dm: backfill missing calls to mutex_destroy() Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 ++ drivers/md/dm-delay.c | 2 ++ drivers/md/dm-kcopyd.c | 6 ++++-- drivers/md/dm-mpath.c | 1 + drivers/md/dm-stats.c | 1 + drivers/md/dm-thin.c | 7 +++++++ drivers/md/dm-zoned-metadata.c | 3 +++ drivers/md/dm-zoned-target.c | 3 +++ drivers/md/dm.c | 4 ++++ 9 files changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9c53367d2f3e..09f4ff39269b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2192,6 +2192,8 @@ static void crypt_dtr(struct dm_target *ti) kzfree(cc->cipher_auth); kzfree(cc->authenc_key); + mutex_destroy(&cc->bio_alloc_lock); + /* Must zero key material before freeing */ kzfree(cc); } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 288386bfbfb5..1783d80c9cad 100644 --- a/drivers/md/dm-delay.c +++ 
b/drivers/md/dm-delay.c @@ -229,6 +229,8 @@ static void delay_dtr(struct dm_target *ti) if (dc->dev_write) dm_put_device(ti, dc->dev_write); + mutex_destroy(&dc->timer_lock); + kfree(dc); } diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index eb45cc3df31d..e6e7c686646d 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -477,8 +477,10 @@ static int run_complete_job(struct kcopyd_job *job) * If this is the master job, the sub jobs have already * completed so we can free everything. */ - if (job->master_job == job) + if (job->master_job == job) { + mutex_destroy(&job->lock); mempool_free(job, kc->job_pool); + } fn(read_err, write_err, context); if (atomic_dec_and_test(&kc->nr_jobs)) @@ -750,6 +752,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, * followed by SPLIT_COUNT sub jobs. */ job = mempool_alloc(kc->job_pool, GFP_NOIO); + mutex_init(&job->lock); /* * set up for the read. @@ -811,7 +814,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, if (job->source.count <= SUB_JOB_SIZE) dispatch_job(job); else { - mutex_init(&job->lock); job->progress = 0; split_job(job); } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6d1a9906c582..be581765edd1 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -248,6 +248,7 @@ static void free_multipath(struct multipath *m) kfree(m->hw_handler_name); kfree(m->hw_handler_params); + mutex_destroy(&m->work_mutex); kfree(m); } diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 29bc51084c82..56059fb56e2d 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -228,6 +228,7 @@ void dm_stats_cleanup(struct dm_stats *stats) dm_stat_free(&s->rcu_head); } free_percpu(stats->last); + mutex_destroy(&stats->mutex); } static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index f91d771fff4b..c1c6160be355 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -492,6 +492,11 @@ static void pool_table_init(void) INIT_LIST_HEAD(&dm_thin_pool_table.pools); } +static void pool_table_exit(void) +{ + mutex_destroy(&dm_thin_pool_table.mutex); +} + static void __pool_table_insert(struct pool *pool) { BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); @@ -4387,6 +4392,8 @@ static void dm_thin_exit(void) dm_unregister_target(&pool_target); kmem_cache_destroy(_new_mapping_cache); + + pool_table_exit(); } module_init(dm_thin_init); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 70485de37b66..969954915566 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2333,6 +2333,9 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) /* Free the zone descriptors */ dmz_drop_zones(zmd); + + mutex_destroy(&zmd->mblk_flush_lock); + mutex_destroy(&zmd->map_lock); } /* diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 6d7bda6f8190..caff02caf083 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -827,6 +827,7 @@ err_fwq: err_cwq: destroy_workqueue(dmz->chunk_wq); err_bio: + mutex_destroy(&dmz->chunk_lock); bioset_free(dmz->bio_set); err_meta: dmz_dtr_metadata(dmz->metadata); @@ -861,6 +862,8 @@ static void dmz_dtr(struct dm_target *ti) dmz_put_zoned_device(ti); + mutex_destroy(&dmz->chunk_lock); + kfree(dmz); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 73d7f316ac1d..67bf11610e4d 100644 --- a/drivers/md/dm.c +++ 
b/drivers/md/dm.c @@ -1789,6 +1789,10 @@ static void cleanup_mapped_device(struct mapped_device *md) md->bdev = NULL; } + mutex_destroy(&md->suspend_lock); + mutex_destroy(&md->type_lock); + mutex_destroy(&md->table_devices_lock); + dm_mq_cleanup_mapped_device(md); } From 4b259fc4a8a12dcd0ffd670877a7a1ca2ab0f3e3 Mon Sep 17 00:00:00 2001 From: Ma Shimiao Date: Tue, 12 Dec 2017 17:39:10 +0800 Subject: [PATCH 60/69] dm log writes: fix max length used for kstrndup If the source string is longer than max, kstrndup will allocate max+1 bytes, so pass maxsize - 1 to ensure the result does not exceed max. Signed-off-by: Ma Shimiao Signed-off-by: Mike Snitzer --- drivers/md/dm-log-writes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 189badbeddaf..3362d866793b 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -594,7 +594,7 @@ static int log_mark(struct log_writes_c *lc, char *data) return -ENOMEM; } - block->data = kstrndup(data, maxsize, GFP_KERNEL); + block->data = kstrndup(data, maxsize - 1, GFP_KERNEL); if (!block->data) { DMERR("Error copying mark data"); kfree(block); From 050af08ffb1b62af69196d61c22a0755f9a3cdbd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 11 Jan 2018 14:01:56 +0800 Subject: [PATCH 61/69] dm mpath: return DM_MAPIO_REQUEUE on blk-mq rq allocation failure blk-mq will rerun the queue via RESTART or a dispatch wakeup after a request completes, so there is no need to wait an arbitrary amount of time before requeueing; we should trust blk-mq to do it. More importantly, we need to return BLK_STS_RESOURCE to blk-mq so that dequeuing from the I/O scheduler can be stopped; this results in improved I/O merging. Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index be581765edd1..e0187798ccae 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -533,8 +533,20 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, if (blk_queue_dying(q)) { atomic_inc(&m->pg_init_in_progress); activate_or_offline_path(pgpath); + return DM_MAPIO_DELAY_REQUEUE; } - return DM_MAPIO_DELAY_REQUEUE; + + /* + * blk-mq's SCHED_RESTART can cover this requeue, so we + * needn't deal with it by DELAY_REQUEUE. More importantly, + * we have to return DM_MAPIO_REQUEUE so that blk-mq can + * get the queue busy feedback (via BLK_STS_RESOURCE), + * otherwise I/O merging can suffer. + */ + if (q->mq_ops) + return DM_MAPIO_REQUEUE; + else + return DM_MAPIO_DELAY_REQUEUE; } clone->bio = clone->biotail = NULL; clone->rq_disk = bdev->bd_disk; From 459b54019cfeb7330ed4863ad40f78489e0ff23d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 11 Jan 2018 14:01:55 +0800 Subject: [PATCH 62/69] dm mpath: return DM_MAPIO_DELAY_REQUEUE if QUEUE_IO or PG_INIT_REQUIRED Avoid using DM_MAPIO_REQUEUE unless absolutely necessary because it results in dm-rq.c:dm_mq_queue_rq() returning BLK_STS_RESOURCE to blk-mq -- doing so should only ever be done if the underlying queue is out of resources. So switch to returning DM_MAPIO_DELAY_REQUEUE from multipath_clone_and_map() if either MPATHF_QUEUE_IO or MPATHF_PG_INIT_REQUIRED is set.
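Taken together, this patch and the previous one amount to a small requeue policy for multipath_clone_and_map(). The sketch below distills that policy into standalone C as described in the two commit messages; requeue_mode() and its parameters are illustrative names (busy_flags stands in for MPATHF_QUEUE_IO/MPATHF_PG_INIT_REQUIRED, mq for q->mq_ops != NULL), and this is a hedged model, not the kernel function:

#include <stdio.h>

enum mapio { MAPIO_REQUEUE, MAPIO_DELAY_REQUEUE };

static enum mapio requeue_mode(int busy_flags, int queue_dying, int mq)
{
	/* path-group init still pending: retry later, not immediately */
	if (busy_flags)
		return MAPIO_DELAY_REQUEUE;

	/* underlying queue is going away while a path switch is in flight */
	if (queue_dying)
		return MAPIO_DELAY_REQUEUE;

	/*
	 * Clone allocation failed: blk-mq's SCHED_RESTART will rerun the
	 * queue, and an immediate requeue surfaces BLK_STS_RESOURCE so the
	 * I/O scheduler stops dequeuing (better merging).  The legacy
	 * .request_fn path has no such feedback, so delay instead.
	 */
	return mq ? MAPIO_REQUEUE : MAPIO_DELAY_REQUEUE;
}

int main(void)
{
	printf("%d %d %d\n",
	       requeue_mode(1, 0, 1),	/* busy flags set       -> delay   */
	       requeue_mode(0, 1, 1),	/* queue dying          -> delay   */
	       requeue_mode(0, 0, 1));	/* blk-mq alloc failure -> requeue */
	return 0;
}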
Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e0187798ccae..815de2b091a5 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -517,9 +517,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_KILL; } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { - if (pg_init_all_paths(m)) - return DM_MAPIO_DELAY_REQUEUE; - return DM_MAPIO_REQUEUE; + pg_init_all_paths(m); + return DM_MAPIO_DELAY_REQUEUE; } mpio->pgpath = pgpath; From ac514ffc968bf14649dd0e048447dc966ee49555 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 12 Jan 2018 19:53:40 -0500 Subject: [PATCH 63/69] dm mpath: delay the retry of a request if the target responded as busy Add DM_ENDIO_DELAY_REQUEUE to allow request-based multipath's multipath_end_io() to instruct dm-rq.c:dm_done() to delay a requeue. This is beneficial to do if BLK_STS_RESOURCE is returned from the target (because the target is busy). Relative to blk-mq: kick the hw queues via blk_mq_requeue_work(), indirectly from dm-rq.c:__dm_mq_kick_requeue_list(), after a delay. For old .request_fn: use blk_delay_queue(). bio-based multipath doesn't have feature parity with request-based for retryable error requeues; that is something that'll need fixing in the future. Suggested-by: Bart Van Assche Signed-off-by: Mike Snitzer Acked-by: Bart Van Assche [as interpreted from Bart's "... patch looks fine to me."] --- drivers/md/dm-mpath.c | 5 ++++- drivers/md/dm-rq.c | 4 ++++ include/linux/device-mapper.h | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 815de2b091a5..a8b1ffc0cb3d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1585,7 +1585,10 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (error && !noretry_error(error)) { struct multipath *m = ti->private; - r = DM_ENDIO_REQUEUE; + if (error == BLK_STS_RESOURCE) + r = DM_ENDIO_DELAY_REQUEUE; + else + r = DM_ENDIO_REQUEUE; if (pgpath) fail_path(pgpath); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 9d32f25489c2..b78ff6921cfb 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -315,6 +315,10 @@ static void dm_done(struct request *clone, blk_status_t error, bool mapped) /* The target wants to requeue the I/O */ dm_requeue_original_request(tio, false); break; + case DM_ENDIO_DELAY_REQUEUE: + /* The target wants to requeue the I/O after a delay */ + dm_requeue_original_request(tio, true); + break; default: DMWARN("unimplemented target endio return value: %d", r); BUG(); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 9ba84532947d..da83f64952e7 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -550,6 +550,7 @@ do { \ #define DM_ENDIO_DONE 0 #define DM_ENDIO_INCOMPLETE 1 #define DM_ENDIO_REQUEUE 2 +#define DM_ENDIO_DELAY_REQUEUE 3 /* * Definitions of return values from target map function.
@@ -557,7 +558,7 @@ do { \ #define DM_MAPIO_SUBMITTED 0 #define DM_MAPIO_REMAPPED 1 #define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE -#define DM_MAPIO_DELAY_REQUEUE 3 +#define DM_MAPIO_DELAY_REQUEUE DM_ENDIO_DELAY_REQUEUE #define DM_MAPIO_KILL 4 #define dm_sector_div64(x, y)( \ From c12c9a3c3860c76ba273798c0c34c6f1294cc759 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 12 Jan 2018 09:32:21 -0500 Subject: [PATCH 64/69] dm: various cleanups to md->queue initialization code Also, add dm_sysfs_init() error handling to dm_create(). Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 2 -- drivers/md/dm-rq.c | 2 -- drivers/md/dm.c | 30 ++++++++++++------------------ 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 124ffa2d6b9a..3222e21cbbf8 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -129,8 +129,6 @@ struct mapped_device { struct srcu_struct io_barrier; }; -void dm_init_md_queue(struct mapped_device *md); -void dm_init_normal_md_queue(struct mapped_device *md); int md_in_flight(struct mapped_device *md); void disable_write_same(struct mapped_device *md); void disable_write_zeroes(struct mapped_device *md); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index b78ff6921cfb..c59c59cfd2a5 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -704,7 +704,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) /* disable dm_old_request_fn's merge heuristic by default */ md->seq_rq_merge_deadline_usecs = 0; - dm_init_normal_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); /* Initialize the request-based DM worker thread */ @@ -814,7 +813,6 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) err = PTR_ERR(q); goto out_tag_set; } - dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ err = blk_mq_register_dev(disk_to_dev(md->disk), q); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 67bf11610e4d..148087932679 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1733,20 +1733,9 @@ static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); -void dm_init_md_queue(struct mapped_device *md) -{ - /* - * Initialize data that will only be used by a non-blk-mq DM queue - * - must do so here (in alloc_dev callchain) before queue is used - */ - md->queue->queuedata = md; - md->queue->backing_dev_info->congested_data = md; -} - -void dm_init_normal_md_queue(struct mapped_device *md) +static void dm_init_normal_md_queue(struct mapped_device *md) { md->use_blk_mq = false; - dm_init_md_queue(md); /* * Initialize aspects of queue that aren't relevant for blk-mq @@ -1846,10 +1835,10 @@ static struct mapped_device *alloc_dev(int minor) md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; + md->queue->queuedata = md; + md->queue->backing_dev_info->congested_data = md; - dm_init_md_queue(md); - - md->disk = alloc_disk_node(1, numa_node_id); + md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) goto bad; @@ -2082,13 +2071,18 @@ static struct dm_table *__unbind(struct mapped_device *md) */ int dm_create(int minor, struct mapped_device **result) { + int r; struct mapped_device *md; md = alloc_dev(minor); if (!md) return -ENXIO; - dm_sysfs_init(md); + r = dm_sysfs_init(md); + if (r) { + free_dev(md); + return r; + } *result = md; return 0; @@ -2145,6 +2139,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 
switch (type) { case DM_TYPE_REQUEST_BASED: + dm_init_normal_md_queue(md); r = dm_old_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based mapped device"); @@ -2236,7 +2231,6 @@ EXPORT_SYMBOL_GPL(dm_device_name); static void __dm_destroy(struct mapped_device *md, bool wait) { - struct request_queue *q = dm_get_md_queue(md); struct dm_table *map; int srcu_idx; @@ -2247,7 +2241,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); - blk_set_queue_dying(q); + blk_set_queue_dying(md->queue); if (dm_request_based(md) && md->kworker_task) kthread_flush_worker(&md->kworker); From eaa160ededfad7a38f7ee06dc1af2ced1b410ad8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 13 Jan 2018 14:33:30 -0500 Subject: [PATCH 65/69] dm table: fix NVMe bio-based dm_table_determine_type() validation The 'verify_rq_based:' code in dm_table_determine_type() was checking all devices in the DM table rather than only checking the data devices. Fix this by using the immutable target's iterate_devices method. Also, tweak the block of dm_table_determine_type() code that decides whether to upgrade from DM_TYPE_BIO_BASED to DM_TYPE_NVME_BIO_BASED so that it makes sure the immutable_target doesn't require splitting IOs. These changes have been verified to allow a "thin-pool" target whose data device is an NVMe device to be upgraded to DM_TYPE_NVME_BIO_BASED. Using the thin-pool in NVMe bio-based mode was verified to pass all the device-mapper-test-suite's "thin-provisioning" tests. Also verified that request-based DM multipath (with queue_mode "rq" and "mq") works as expected using the 'mptest' harness. Fixes: 22c11858e ("dm: introduce DM_TYPE_NVME_BIO_BASED") Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 57 ++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ad4ac294dd57..5fe7ec356c33 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -912,13 +912,31 @@ static bool dm_table_supports_dax(struct dm_table *t) static bool dm_table_does_not_support_partial_completion(struct dm_table *t); +struct verify_rq_based_data { + unsigned sq_count; + unsigned mq_count; +}; + +static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + struct verify_rq_based_data *v = data; + + if (q->mq_ops) + v->mq_count++; + else + v->sq_count++; + + return queue_is_rq_based(q); +} + static int dm_table_determine_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - unsigned sq_count = 0, mq_count = 0; + struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0}; struct dm_target *tgt; - struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); @@ -972,11 +990,15 @@ static int dm_table_determine_type(struct dm_table *t) if (dm_table_supports_dax(t) || (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { t->type = DM_TYPE_DAX_BIO_BASED; - } else if ((dm_table_get_immutable_target(t) && - dm_table_does_not_support_partial_completion(t)) || - (list_empty(devices) && live_md_type == DM_TYPE_NVME_BIO_BASED)) { - t->type = DM_TYPE_NVME_BIO_BASED; - goto verify_rq_based; + } else { + /* Check if upgrading to NVMe bio-based is valid or required */ + tgt =
dm_table_get_immutable_target(t); + if (tgt && !tgt->max_io_len && dm_table_does_not_support_partial_completion(t)) { + t->type = DM_TYPE_NVME_BIO_BASED; + goto verify_rq_based; /* must be stacked directly on NVMe (blk-mq) */ + } else if (list_empty(devices) && live_md_type == DM_TYPE_NVME_BIO_BASED) { + t->type = DM_TYPE_NVME_BIO_BASED; + } } return 0; } @@ -1025,25 +1047,16 @@ verify_rq_based: } /* Non-request-stackable devices can't be used for request-based dm */ - list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - - if (!queue_is_rq_based(q)) { - DMERR("table load rejected: including" - " non-request-stackable devices"); - return -EINVAL; - } - - if (q->mq_ops) - mq_count++; - else - sq_count++; + if (!tgt->type->iterate_devices || + !tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) { + DMERR("table load rejected: including non-request-stackable devices"); + return -EINVAL; } - if (sq_count && mq_count) { + if (v.sq_count && v.mq_count) { DMERR("table load rejected: not all devices are blk-mq request-stackable"); return -EINVAL; } - t->all_blk_mq = mq_count > 0; + t->all_blk_mq = v.mq_count > 0; if (!t->all_blk_mq && (t->type == DM_TYPE_MQ_REQUEST_BASED || t->type == DM_TYPE_NVME_BIO_BASED)) { From bd6d1e0a5fafd47acb4ca5ca51d5dc8c7563112c Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Wed, 17 Jan 2018 15:09:25 +0000 Subject: [PATCH 66/69] dm thin: fix trailing semicolon in __remap_and_issue_shared_cell The trailing semicolon is an empty statement that does nothing; remove it. Signed-off-by: Luis de Bethencourt Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c1c6160be355..629c555890c1 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1722,7 +1722,7 @@ static void __remap_and_issue_shared_cell(void *context, bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds); inc_all_io_entry(info->tc->pool, bio); From cc65661981ae2424e27c695ae8d15604448eb666 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Tue, 23 Jan 2018 10:55:18 -0700 Subject: [PATCH 67/69] dm unstripe: fix target length versus number of stripes size check Since the unstripe target takes a target length which is the size of *one* striped member we're trying to expose, not the total size of *all* the striped members, the check does not make sense and fails for some striped setups. For example, say we have a 4TB striped device, i.e. 3907018496 sectors per underlying device: if (sector_div(width, uc->stripes)) : 3907018496 / 2 (num stripes) == 1953509248 tmp_len = width; if (sector_div(tmp_len, uc->chunk_size)) : 1953509248 / 256 (chunk size) == 7630895.5 (fails) Fix this by removing the first check, which isn't valid for unstriping.
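To make the arithmetic concrete, here is a hedged userspace rendering of the old check versus the corrected one, using the numbers from the commit message; unstripe_check() is an illustrative name, and plain % stands in for the kernel's sector_div() remainder test:

#include <stdio.h>
#include <stdint.h>

static int unstripe_check(uint64_t target_len, uint32_t chunk_size)
{
	if (target_len % chunk_size) {
		fprintf(stderr, "Target length not divisible by chunk size\n");
		return -1;
	}
	return 0;
}

int main(void)
{
	uint64_t len = 3907018496ULL;	/* one striped member, in sectors */

	/* old logic halved the length first, then the chunk test failed: */
	printf("(len / 2 stripes) %% 256 = %llu\n",
	       (unsigned long long)((len / 2) % 256));	/* prints 128 */

	/* corrected logic tests the member length itself: 3907018496 % 256 == 0 */
	return unstripe_check(len, 256);
}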
Signed-off-by: Scott Bauer Signed-off-by: Mike Snitzer --- drivers/md/dm-unstripe.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index 061b4f10bf5c..65f838fa2e99 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -44,7 +44,7 @@ static void cleanup_unstripe(struct unstripe_c *uc, struct dm_target *ti) static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct unstripe_c *uc; - sector_t width, tmp_len; + sector_t tmp_len; unsigned long long start; char dummy; @@ -100,13 +100,7 @@ static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) uc->unstripe_width = (uc->stripes - 1) * uc->chunk_size; uc->chunk_shift = fls(uc->chunk_size) - 1; - width = ti->len; - if (sector_div(width, uc->stripes)) { - ti->error = "Target length not divisible by number of stripes"; - goto err; - } - - tmp_len = width; + tmp_len = ti->len; if (sector_div(tmp_len, uc->chunk_size)) { ti->error = "Target length not divisible by chunk size"; goto err; From f20426056f2eba3f0379779f0a75722e41bc28da Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Fri, 19 Jan 2018 15:07:37 -0800 Subject: [PATCH 68/69] dm mpath selector: more evenly distribute ties Move the last used path to the end of the list (least preferred) so that ties are more evenly distributed. For example, with three paths where one is slower than the others, the remaining two would be used unevenly if they tie. This is due to the rotation not being a truly fair distribution. Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are 'tied' Three possible rotations: (a, b, c) -> best path 'a' (b, c, a) -> best path 'b' (c, a, b) -> best path 'a' (a, b, c) -> best path 'a' (b, c, a) -> best path 'b' (c, a, b) -> best path 'a' ... So 'a' is used 2x as often as 'b', although they should be used evenly. With this change, the most recently used path is always the least preferred, removing this bias and resulting in an even distribution. (a, b, c) -> best path 'a' (b, c, a) -> best path 'b' (c, a, b) -> best path 'a' (c, b, a) -> best path 'b' ... Signed-off-by: Khazhismel Kumykov Reviewed-by: Martin Wilck Signed-off-by: Mike Snitzer --- drivers/md/dm-queue-length.c | 6 +++--- drivers/md/dm-service-time.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 23f178641794..969c4f1a3633 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -195,9 +195,6 @@ static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) if (list_empty(&s->valid_paths)) goto out; - /* Change preferred (first in list) path to evenly balance. */ - list_move_tail(s->valid_paths.next, &s->valid_paths); - list_for_each_entry(pi, &s->valid_paths, list) { if (!best || (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) best = pi; @@ -210,6 +207,9 @@ static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) if (!best) goto out; + /* Move most recently used to least preferred to evenly balance.
*/ + list_move_tail(&best->list, &s->valid_paths); + ret = best->path; out: spin_unlock_irqrestore(&s->lock, flags); diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 7b8642045c55..f006a9005593 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -282,9 +282,6 @@ static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) if (list_empty(&s->valid_paths)) goto out; - /* Change preferred (first in list) path to evenly balance. */ - list_move_tail(s->valid_paths.next, &s->valid_paths); - list_for_each_entry(pi, &s->valid_paths, list) if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) best = pi; @@ -292,6 +289,9 @@ static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) if (!best) goto out; + /* Move most recently used to least preferred to evenly balance. */ + list_move_tail(&best->list, &s->valid_paths); + ret = best->path; out: spin_unlock_irqrestore(&s->lock, flags); From 9614e2ba9161c7f5419f4212fa6057d2a65f6ae6 Mon Sep 17 00:00:00 2001 From: John Pittman Date: Tue, 30 Jan 2018 16:39:00 -0500 Subject: [PATCH 69/69] dm cache: Documentation: update default migration_threshold value In commit f8350daf7af0 ("dm cache: tune migration throttling") the value for DEFAULT_MIGRATION_THRESHOLD was decreased from 204800 to 2048. Edit device-mapper/cache.txt to reflect the correct default value for migration_threshold. Signed-off-by: John Pittman Signed-off-by: Mike Snitzer --- Documentation/device-mapper/cache.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index 79c7b6dc88ae..ff0841711fd5 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -119,7 +119,7 @@ doing here to avoid migrating during those peak io moments. For the time being, a message "migration_threshold <#sectors>" can be used to set the maximum number of sectors being migrated, -the default being 204800 sectors (100MB). +the default being 2048 sectors (1MB).
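As a closing illustration of the path-selector change above (patch 68), here is a hedged userspace model: a plain array stands in for the kernel's list_head chain and an integer load count for the atomic queue length, so only the move-to-tail step corresponds to the real list_move_tail() call. Run over six selections it prints a, b, a, b, a, b -- the even split the commit message promises:

#include <stdio.h>

#define NPATHS 3

int main(void)
{
	char order[NPATHS] = { 'a', 'b', 'c' };
	int load[128] = { ['a'] = 0, ['b'] = 0, ['c'] = 1 };	/* 'c' has 1 outstanding IO */

	for (int round = 0; round < 6; round++) {
		int best = 0;

		/* first of the least-loaded paths wins a tie */
		for (int i = 1; i < NPATHS; i++)
			if (load[(int)order[i]] < load[(int)order[best]])
				best = i;

		char chosen = order[best];
		printf("best path '%c'\n", chosen);

		/* the fix: move the chosen path to the tail (least preferred) */
		for (int i = best; i < NPATHS - 1; i++)
			order[i] = order[i + 1];
		order[NPATHS - 1] = chosen;
	}
	return 0;
}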